127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
# coding=utf-8
|
||
import re
|
||
|
||
class OcrParser:
    """Rule-based parser that turns OCR text lines into station data.

    No LLM is involved: every field is extracted with keyword checks and
    regular expressions.
    """

    # Keywords marking a line as an attribute line rather than a station name.
    _NAME_EXCLUDE_KEYWORDS = ("km", "/度", "¥", "停车", "积分", "组团")
    # Lines that consist solely of a charger-type label.
    _PILE_TYPE_LABELS = ("快", "慢", "超", "快充", "慢充", "超充")
    # Totals above this are treated as dates / OCR noise, not pile counts.
    _MAX_PILE_TOTAL = 500

    # Patterns are compiled once here instead of on every loop iteration.
    _LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
    _CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
    _DIGITS_ONLY_RE = re.compile(r"^\d+$")
    _FRACTION_RE = re.compile(r"\d+/\d+")
    _DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*(km|m)", re.IGNORECASE)
    _PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
    _PILE_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
    _PARKING_PREFIX_RE = re.compile(r"^[·\.\sP]+")

    @staticmethod
    def parse(ocr_lines):
        """Parse OCR text lines into a structured station record.

        Args:
            ocr_lines: Iterable of OCR output strings. Each may carry a
                "行 N: " prefix and a "(置信度: ...)" suffix, both of which
                are stripped before matching. ``None`` is treated as empty.

        Returns:
            dict with keys:
                "station_name" (str): first line that does not look like an
                    attribute line; "" if none.
                "distance" (str): first "<number>km" / "<number>m" match as
                    it appears in the text; "" if none.
                "price" (float | None): first "<decimal>/度" value.
                "piles" (list[dict]): one ``{"type", "free", "total"}`` entry
                    per line containing a "free/total" fraction.
                "parking" (str): first line containing "停车" with leading
                    bullet characters removed; "" if none.
                "tags" (list[str]): recognised tag lines, in input order.
        """
        lines = OcrParser._clean_lines(ocr_lines)
        return {
            "station_name": OcrParser._extract_station_name(lines),
            "distance": OcrParser._extract_distance(lines),
            "price": OcrParser._extract_price(lines),
            "piles": OcrParser._extract_piles(lines),
            "parking": OcrParser._extract_parking(lines),
            "tags": OcrParser._extract_tags(lines),
        }

    @staticmethod
    def _clean_lines(ocr_lines):
        """Strip the line-number prefix and confidence suffix; drop blanks."""
        cleaned = []
        for raw in ocr_lines or []:
            text = OcrParser._LINE_PREFIX_RE.sub("", raw)
            text = OcrParser._CONFIDENCE_SUFFIX_RE.sub("", text).strip()
            if text:
                cleaned.append(text)
        return cleaned

    @staticmethod
    def _extract_station_name(lines):
        """Return the first line that is not an attribute/noise line."""
        for line in lines:
            if len(line) < 2:
                continue
            # OCR of Chinese text yields both ASCII and full-width "(";
            # a parenthesised line is a remark, not a station name.
            if line.startswith(("(", "(")) or line == "…":
                continue
            lowered = line.lower()
            if any(k in lowered for k in OcrParser._NAME_EXCLUDE_KEYWORDS):
                continue
            if OcrParser._DIGITS_ONLY_RE.match(line):
                continue
            # Lines such as "闲2/4" or "快" describe charging piles.
            if OcrParser._FRACTION_RE.search(line):
                continue
            if line in OcrParser._PILE_TYPE_LABELS:
                continue
            return line
        return ""

    @staticmethod
    def _extract_distance(lines):
        """Return the first distance token such as "7.4km" or "90m"."""
        for line in lines:
            m = OcrParser._DISTANCE_RE.search(line)
            if m:
                return m.group(0)
        return ""

    @staticmethod
    def _extract_price(lines):
        """Return the first "<decimal>/度" price as a float, else None."""
        for line in lines:
            m = OcrParser._PRICE_RE.search(line)
            if not m:
                continue
            try:
                return float(m.group(1))
            except ValueError:
                # Best effort: an unparsable number just means "keep looking".
                continue
        return None

    @staticmethod
    def _extract_piles(lines):
        """Collect "free/total" pile counts, tagged with the latest type seen.

        The type label ("快" / "慢" / "超") may sit on the same line as the
        fraction or on an earlier line, so it is carried across lines.
        """
        piles = []
        current_type = "未知"
        for line in lines:
            if "快" in line:
                current_type = "快"
            elif "慢" in line:
                current_type = "慢"
            elif "超" in line:
                current_type = "超"

            # Price lines ("0.7111/度") also contain "/" and must be skipped.
            if "/度" in line:
                continue

            m = OcrParser._PILE_RE.search(line)
            if not m:
                continue
            try:
                free = int(m.group(1))
                total = int(m.group(2))
            except ValueError:
                continue

            # Reject dates and OCR noise: an implausibly large total, or more
            # free piles than exist (e.g. "2024/12" -> free=2024, total=12).
            if total > OcrParser._MAX_PILE_TOTAL or free > total:
                continue

            piles.append({
                "type": current_type,
                "free": free,
                "total": total,
            })
        return piles

    @staticmethod
    def _extract_parking(lines):
        """Return the first line mentioning "停车", minus leading bullets."""
        for line in lines:
            if "停车" in line:
                # e.g. "·收费停车" -> "收费停车"
                return OcrParser._PARKING_PREFIX_RE.sub("", line)
        return ""

    @staticmethod
    def _extract_tags(lines):
        """Collect the recognised tag lines in input order."""
        tags = []
        for line in lines:
            if "积分" in line:
                tags.append(line)
            elif "组团" in line:
                tags.append("组团")
            elif line.strip() == "P":
                tags.append("P")
        return tags