108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
|
|
# coding=utf-8
|
|||
|
|
import re
|
|||
|
|
import json
|
|||
|
|
|
|||
|
|
# Patterns are compiled once at import time so that repeated calls do not
# go through the ``re`` pattern cache for every OCR row.
_LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
_CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
_DIGITS_ONLY_RE = re.compile(r"^\d+$")
_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
_PILE_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
_PARKING_LEAD_RE = re.compile(r"^[·\.\sP]+")

# OCR engines emit either the half-width or the full-width form of these
# characters, so both variants are listed explicitly (as escapes, to keep
# the two look-alike glyphs distinguishable in the source).
_OPENING_PARENS = ("\u0028", "\uff08")  # "(" and full-width "("
_CURRENCY_MARKS = ("\u00a5", "\uffe5")  # yen/yuan sign and its full-width form

# Checked in this order; the first keyword found in a row wins.
_PILE_TYPE_KEYWORDS = ("快", "慢", "超")


def _clean_ocr_line(line):
    """Strip the leading "行 N: " label and the trailing "(置信度: x)" note."""
    text = _LINE_PREFIX_RE.sub("", line)
    text = _CONFIDENCE_SUFFIX_RE.sub("", text)
    return text.strip()


def _find_station_name(clean_lines):
    """Return the first row that does not look like noise or an attribute."""
    for line in clean_lines:
        # Empty rows and single-character OCR noise cannot be a name.
        if len(line) < 2:
            continue
        # Rows opening with a parenthesis (either width) are fragments.
        if line.startswith(_OPENING_PARENS):
            continue
        # Rows carrying distance, price or currency markers are attributes.
        if "km" in line.lower() or "/度" in line:
            continue
        if any(mark in line for mark in _CURRENCY_MARKS):
            continue
        # A purely numeric row is not a station name.
        if _DIGITS_ONLY_RE.match(line):
            continue
        return line
    return ""


def _first_match(pattern, clean_lines):
    """Return the first match of *pattern* over the rows, or None."""
    for line in clean_lines:
        match = pattern.search(line)
        if match:
            return match
    return None


def _collect_piles(clean_lines):
    """Collect every "free/total" row, tagged with the latest pile type seen."""
    piles = []
    current_type = "未知"
    for line in clean_lines:
        # The type keyword may sit on its own row before the counts, so it
        # is remembered until another keyword replaces it.
        for keyword in _PILE_TYPE_KEYWORDS:
            if keyword in line:
                current_type = keyword
                break

        match = _PILE_RE.search(line)
        # A row containing a dot is treated as a price, not a pile count.
        if match is None or "." in line:
            continue
        piles.append({
            "type": current_type,
            "free": int(match.group(1)),
            "total": int(match.group(2)),
        })
    return piles


def _find_parking(clean_lines):
    """Return the first row mentioning parking, minus leading bullets."""
    for line in clean_lines:
        if "停车" in line:
            return _PARKING_LEAD_RE.sub("", line)
    return ""


def parse_ocr_lines(lines):
    """Extract charging-station fields from a list of OCR text rows.

    Each row may carry a leading "行 N: " label and a trailing
    "(置信度: x)" confidence note; both are stripped before parsing.

    Args:
        lines: Iterable of OCR row strings. ``None`` is treated as an empty
            input, and entries that are not ``str`` are ignored.

    Returns:
        A dict with these keys, in this order:
            station_name: first row that passes the noise/attribute filters,
                or "" when none does.
            distance: first "<number>km" substring (case-insensitive),
                or "" when none is found.
            price: float taken from the first "<decimal>/度" row, or None.
            piles: list of {"type", "free", "total"} dicts, one per
                "free/total" row; "type" is the most recent of
                快 / 慢 / 超 seen so far, or "未知".
            parking: first row containing "停车" with leading bullet
                characters removed, or "".
            tags: always an empty list; no tag extraction is implemented.
    """
    clean_lines = [
        _clean_ocr_line(line)
        for line in (lines or ())
        if isinstance(line, str)
    ]

    distance_match = _first_match(_DISTANCE_RE, clean_lines)
    price_match = _first_match(_PRICE_RE, clean_lines)

    return {
        "station_name": _find_station_name(clean_lines),
        "distance": distance_match.group(0) if distance_match else "",
        "price": float(price_match.group(1)) if price_match else None,
        "piles": _collect_piles(clean_lines),
        "parking": _find_parking(clean_lines),
        "tags": [],
    }
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Demo run: feed a captured OCR dump through the parser and show both
    # the raw rows and the structured result.
    demo_rows = [
        "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)",
        "行 2: (… (置信度: 0.6244)",
        "行 3: 7.4km (置信度: 0.9975)",
        "行 4: 0.7111/度 (置信度: 0.9450)",
        "行 5: 快 (置信度: 0.9987)",
        "行 6: 闲3/4 (置信度: 0.9941)",
        "行 7: ¥ (置信度: 0.8734)",
        "行 8: 组团 (置信度: 0.9995)",
        "行 9: 2倍积分 (置信度: 0.9997)",
        "行 10: P (置信度: 0.9929)",
        "行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736)",
    ]

    # Echo the raw rows, one per output line.
    print("--- Input Data ---")
    print(*demo_rows, sep="\n")

    # Parse and pretty-print; ensure_ascii=False keeps the Chinese text
    # readable instead of \u-escaped.
    report = json.dumps(
        parse_ocr_lines(demo_rows),
        ensure_ascii=False,
        indent=2,
    )
    print("\n--- Parsed Result (Regex) ---")
    print(report)
|