108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
# coding=utf-8
|
||
import re
|
||
import json
|
||
|
||
# Wrapper added around every recognised line by the upstream OCR dump:
# "行 <n>: <text> (置信度: <score>)".
_LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
_CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")

# Field patterns. Compiled once at import time instead of on every loop pass.
_DIGITS_ONLY_RE = re.compile(r"^\d+$")
_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
_PILE_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
_PARKING_LEAD_RE = re.compile(r"^[·\.\sP]+")

# Pile-type markers, in the order they take precedence when a line has several.
_PILE_TYPES = ("快", "慢", "超")
# ASCII and full-width opening parentheses; OCR of Chinese text emits both.
_OPENING_PARENS = ("(", "\uff08")


def _strip_ocr_wrapper(raw_line):
    """Return the text of one OCR line without the line prefix and confidence suffix."""
    text = _LINE_PREFIX_RE.sub("", raw_line)
    text = _CONFIDENCE_SUFFIX_RE.sub("", text)
    return text.strip()


def _looks_like_station_name(text):
    """Return True when *text* passes every station-name heuristic."""
    if len(text) < 2:
        # Empty lines and single-character noise.
        return False
    if text.startswith(_OPENING_PARENS):
        # Parenthesised fragments are annotations, not names.
        return False
    if "km" in text.lower() or "/度" in text or "¥" in text:
        # Distance, price and currency lines are attributes.
        return False
    if _DIGITS_ONLY_RE.match(text):
        # A purely numeric line is unlikely to be a name.
        return False
    return True


def _first_match(pattern, texts):
    """Return the first ``re.Match`` of *pattern* over *texts*, or None."""
    for text in texts:
        found = pattern.search(text)
        if found:
            return found
    return None


def parse_ocr_lines(lines):
    """Extract charging-station fields from a list of OCR text lines.

    Each entry may be wrapped as ``"行 N: <text> (置信度: <score>)"``; the
    wrapper is removed before matching, and entries without it are used as-is.

    Args:
        lines: Iterable of OCR line strings. ``None`` is treated as empty.

    Returns:
        A dict with these keys:

        * ``station_name`` -- first line that passes the name heuristics,
          or ``""`` when none does.
        * ``distance`` -- first ``<number>km`` match as written in the line
          (case-insensitive), or ``""``.
        * ``price`` -- float taken from the first ``<decimal>/度`` match,
          or ``None``.
        * ``piles`` -- list of ``{"type", "free", "total"}`` dicts, one per
          line holding a ``free/total`` pair.
        * ``parking`` -- first line containing ``停车`` with leading bullet
          characters removed, or ``""``.
        * ``tags`` -- always an empty list; this function never fills it.
    """
    result = {
        "station_name": "",
        "distance": "",
        "price": None,
        "piles": [],
        "parking": "",
        "tags": [],
    }
    if lines is None:
        return result

    clean_lines = [_strip_ocr_wrapper(raw_line) for raw_line in lines]

    # 1. Station name: first line that survives every heuristic.
    result["station_name"] = next(
        (text for text in clean_lines if _looks_like_station_name(text)), ""
    )

    # 2. Distance: keep the matched text verbatim, unit included.
    distance_match = _first_match(_DISTANCE_RE, clean_lines)
    if distance_match:
        result["distance"] = distance_match.group(0)

    # 3. Price: a decimal number immediately followed by "/度".
    price_match = _first_match(_PRICE_RE, clean_lines)
    if price_match:
        result["price"] = float(price_match.group(1))

    # 4. Piles: a type marker applies to every following "free/total" pair
    #    until another marker replaces it.
    current_type = "未知"
    for text in clean_lines:
        current_type = next(
            (marker for marker in _PILE_TYPES if marker in text), current_type
        )
        pile_match = _PILE_RE.search(text)
        # A dot anywhere in the line means it is more likely a price or ratio
        # than a pile count, so such lines are ignored.
        if pile_match and "." not in text:
            result["piles"].append({
                "type": current_type,
                "free": int(pile_match.group(1)),
                "total": int(pile_match.group(2)),
            })

    # 5. Parking: first line mentioning parking, minus leading bullet, dot,
    #    whitespace and "P" icon characters.
    for text in clean_lines:
        if "停车" in text:
            result["parking"] = _PARKING_LEAD_RE.sub("", text)
            break

    return result
|
||
|
||
if __name__ == "__main__":
    # Demo run: push one captured OCR dump through the parser and show the
    # raw lines next to the structured result.
    demo_lines = [
        "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)",
        "行 2: (… (置信度: 0.6244)",
        "行 3: 7.4km (置信度: 0.9975)",
        "行 4: 0.7111/度 (置信度: 0.9450)",
        "行 5: 快 (置信度: 0.9987)",
        "行 6: 闲3/4 (置信度: 0.9941)",
        "行 7: ¥ (置信度: 0.8734)",
        "行 8: 组团 (置信度: 0.9995)",
        "行 9: 2倍积分 (置信度: 0.9997)",
        "行 10: P (置信度: 0.9929)",
        "行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736)",
    ]

    # Echo the input first so the parsed output can be compared by eye.
    print("--- Input Data ---")
    for raw_line in demo_lines:
        print(raw_line)

    station_info = parse_ocr_lines(demo_lines)

    # ensure_ascii=False keeps the Chinese text readable in the JSON dump.
    print("\n--- Parsed Result (Regex) ---")
    report = json.dumps(station_info, ensure_ascii=False, indent=2)
    print(report)
|