Files
aiData/Util/OcrParser.py
HuangHai ac79e44282 'commit'
2026-01-12 20:11:18 +08:00

127 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import re
class OcrParser:
    """Rule-based parser for OCR text of a charging-station listing.

    No LLM is involved: every field is extracted with keyword checks and
    regular expressions applied to the cleaned OCR lines.
    """

    # Half-width and full-width yen signs. They are written as escapes so the
    # source file carries no visually confusable characters that tooling may
    # strip or flag.
    _CURRENCY_MARKS = ("\u00a5", "\uffe5")

    # Substrings that mark a line as an attribute row rather than a name.
    # Compared against the lower-cased line, so "KM" is caught as well.
    _ATTRIBUTE_KEYWORDS = ("km", "/度", "停车", "积分", "组团") + _CURRENCY_MARKS

    # Pile-type markers (fast / slow / super), tested in this order; the
    # first one found in a line becomes the current type.
    _PILE_TYPES = ("快", "慢", "超")

    # Lines that consist solely of a pile-type label are never a station name.
    _PILE_TYPE_LABELS = frozenset(("快", "慢", "超", "快充", "慢充", "超充"))

    # Type reported for a count that appears before any type marker.
    _UNKNOWN_PILE_TYPE = "未知"

    # Totals above this are treated as dates or OCR noise (e.g. "12/2024").
    _MAX_PILE_TOTAL = 500

    _LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
    _CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
    _DIGITS_ONLY_RE = re.compile(r"^\d+$")
    _FRACTION_RE = re.compile(r"\d+/\d+")
    _DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*(km|m)", re.IGNORECASE)
    _PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
    _PILE_COUNT_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
    _PARKING_PREFIX_RE = re.compile(r"^[·\.\sP]+")

    @staticmethod
    def parse(ocr_lines):
        """Extract structured station data from OCR text lines.

        Args:
            ocr_lines: Iterable of OCR output strings. Each may carry a
                leading "行 N: " prefix and a trailing "(置信度: ...)" suffix;
                both are removed before matching. ``None`` is treated as an
                empty input and non-string entries are ignored.

        Returns:
            A dict with the keys:
                station_name: first line that looks like a name, else "".
                distance: first "<number>km" / "<number>m" match, else "".
                price: float taken from the first "<decimal>/度", else None.
                piles: list of {"type", "free", "total"} dicts, one per
                    "free/total" count found.
                parking: first line containing "停车" with leading marker
                    characters removed, else "".
                tags: list of recognised tag strings, in line order.
        """
        clean_lines = OcrParser._clean_lines(ocr_lines)
        return {
            "station_name": OcrParser._find_station_name(clean_lines),
            "distance": OcrParser._find_distance(clean_lines),
            "price": OcrParser._find_price(clean_lines),
            "piles": OcrParser._find_piles(clean_lines),
            "parking": OcrParser._find_parking(clean_lines),
            "tags": OcrParser._find_tags(clean_lines),
        }

    @staticmethod
    def _clean_lines(ocr_lines):
        """Strip the line-number prefix and confidence suffix; drop empties."""
        cleaned = []
        for raw in ocr_lines or ():
            if not isinstance(raw, str):
                continue
            text = OcrParser._LINE_PREFIX_RE.sub("", raw)
            text = OcrParser._CONFIDENCE_SUFFIX_RE.sub("", text).strip()
            if text:
                cleaned.append(text)
        return cleaned

    @staticmethod
    def _find_station_name(lines):
        """Return the first line that is not an attribute, count or label."""
        for line in lines:
            if len(line) < 2:
                continue
            if line.startswith("("):
                continue
            lowered = line.lower()
            # Price, distance, parking and promotion rows are not names.
            if any(keyword in lowered for keyword in OcrParser._ATTRIBUTE_KEYWORDS):
                continue
            if OcrParser._DIGITS_ONLY_RE.match(line):
                continue
            # "free/total" rows such as "闲2/4" describe piles, not the station.
            if OcrParser._FRACTION_RE.search(line):
                continue
            if line in OcrParser._PILE_TYPE_LABELS:
                continue
            return line
        return ""

    @staticmethod
    def _find_distance(lines):
        """Return the first distance token such as "7.4km" or "90m"."""
        for line in lines:
            match = OcrParser._DISTANCE_RE.search(line)
            if match:
                return match.group(0)
        return ""

    @staticmethod
    def _find_price(lines):
        """Return the first decimal immediately followed by "/度" as a float."""
        for line in lines:
            match = OcrParser._PRICE_RE.search(line)
            if match:
                # The pattern only admits digits '.' digits, so float() is safe.
                return float(match.group(1))
        return None

    @staticmethod
    def _find_piles(lines):
        """Collect "free/total" counts, tagging each with the latest type seen.

        A type marker may sit on the same line as the count ("快充 3/4") or on
        an earlier line ("快" followed by "闲3/4"), so the current type is
        carried across lines.
        """
        piles = []
        current_type = OcrParser._UNKNOWN_PILE_TYPE
        for line in lines:
            for marker in OcrParser._PILE_TYPES:
                if marker in line:
                    current_type = marker
                    break
            # Price rows also contain '/', but never a pile count.
            if "/度" in line:
                continue
            match = OcrParser._PILE_COUNT_RE.search(line)
            if not match:
                continue
            free = int(match.group(1))
            total = int(match.group(2))
            if total > OcrParser._MAX_PILE_TOTAL:
                continue
            piles.append({"type": current_type, "free": free, "total": total})
        return piles

    @staticmethod
    def _find_parking(lines):
        """Return the first parking line without its leading marker characters."""
        for line in lines:
            if "停车" in line:
                # e.g. "·收费停车" -> "收费停车"
                return OcrParser._PARKING_PREFIX_RE.sub("", line)
        return ""

    @staticmethod
    def _find_tags(lines):
        """Collect the recognised tag lines (points, group-buy, parking mark)."""
        tags = []
        for line in lines:
            if "积分" in line:
                tags.append(line)
            elif "组团" in line:
                tags.append("组团")
            elif line == "P":
                tags.append("P")
        return tags