Files
aiData/Test/TestOcrRegex.py

108 lines
3.3 KiB
Python
Raw Normal View History

2026-01-12 07:49:18 +08:00
# coding=utf-8
import re
import json
# Decorations added around each OCR entry: a "行 N:" prefix and a
# "(置信度: x)" suffix.  Both ASCII and full-width punctuation are accepted.
_OCR_PREFIX_RE = re.compile(r"^\s*行\s*\d+[::]\s*")
_OCR_SUFFIX_RE = re.compile(r"\s*[((]置信度[::].*?[))]\s*$")

_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
# Optional "闲" followed by "<free>/<total>", e.g. "闲3/4" or "3/4".
_PILE_COUNT_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
_PARKING_LEAD_RE = re.compile(r"^[·\.\sP]+")
_DIGITS_ONLY_RE = re.compile(r"\d+")

_UNKNOWN_PILE_TYPE = "未知"
# NOTE(review): reconstructed table.  The previous code compared against and
# assigned empty-string literals here, which made the first branch always fire
# and every pile type "".  The markers/labels below are inferred from the
# sample data (a standalone "快" line preceding "闲3/4") -- confirm them
# against real OCR output and whatever consumes the "type" field.
_PILE_TYPE_MARKERS = (
    ("超", "超充"),
    ("快", "快充"),
    ("慢", "慢充"),
)


def _clean_ocr_lines(lines):
    """Return the text of each entry with the OCR prefix/suffix removed."""
    cleaned = []
    for raw in lines or ():
        if not isinstance(raw, str):
            # Best-effort parser: ignore holes (e.g. None) instead of crashing.
            continue
        text = _OCR_PREFIX_RE.sub("", raw)
        text = _OCR_SUFFIX_RE.sub("", text)
        cleaned.append(text.strip())
    return cleaned


def _first_match(pattern, clean_lines):
    """Return the first regex match of *pattern* over the lines, or None."""
    for line in clean_lines:
        found = pattern.search(line)
        if found:
            return found
    return None


def _find_station_name(clean_lines):
    """Return the first line that does not look like noise or an attribute."""
    for line in clean_lines:
        if len(line) < 2:
            continue  # empty or single-character noise
        if line.startswith(("(", "(")):
            continue  # parenthesised fragments are not names
        if "km" in line.lower() or "/度" in line:
            continue  # distance / price attribute lines
        if _DIGITS_ONLY_RE.fullmatch(line):
            continue  # a purely numeric line is unlikely to be a name
        return line
    return ""


def _find_piles(clean_lines, station_name):
    """Collect "<free>/<total>" pile counts, tagged with the latest type seen."""
    piles = []
    current_type = _UNKNOWN_PILE_TYPE
    for line in clean_lines:
        # A name such as "XX快充站" must not be mistaken for a type marker.
        if line != station_name:
            for marker, label in _PILE_TYPE_MARKERS:
                if marker in line:
                    current_type = label
                    break
        found = _PILE_COUNT_RE.search(line)
        if not found:
            continue
        if "." in line:
            # A dot suggests a price/decimal rather than a pile count.
            continue
        piles.append({
            "type": current_type,
            "free": int(found.group(1)),
            "total": int(found.group(2)),
        })
    return piles


def _find_parking(clean_lines):
    """Return the first line mentioning "停车", minus leading bullets/"P"."""
    for line in clean_lines:
        if "停车" in line:
            return _PARKING_LEAD_RE.sub("", line)
    return ""


def parse_ocr_lines(lines):
    """Extract charging-station fields from a list of OCR text lines.

    Each entry is expected to look like ``"行 N: <text> (置信度: <score>)"``;
    the prefix and suffix are stripped before matching.  Entries that are not
    strings are ignored, and ``None`` is treated as an empty list.

    Args:
        lines: Iterable of raw OCR line strings.

    Returns:
        A dict with these keys:
          * ``station_name``: first line that passes the noise filters, or "".
          * ``distance``: first "<number>km" match as text (e.g. "7.4km"), or "".
          * ``price``: float taken from the first "<decimal>/度" match, or None.
          * ``piles``: list of ``{"type", "free", "total"}`` dicts, one per
            line containing "<int>/<int>" and no dot.
          * ``parking``: first line containing "停车" with leading
            "·", ".", whitespace and "P" characters removed, or "".
          * ``tags``: always an empty list; nothing here populates it.
    """
    clean_lines = _clean_ocr_lines(lines)
    station_name = _find_station_name(clean_lines)

    distance_match = _first_match(_DISTANCE_RE, clean_lines)
    price_match = _first_match(_PRICE_RE, clean_lines)

    return {
        "station_name": station_name,
        "distance": distance_match.group(0) if distance_match else "",
        "price": float(price_match.group(1)) if price_match else None,
        "piles": _find_piles(clean_lines, station_name),
        "parking": _find_parking(clean_lines),
        "tags": [],
    }
if __name__ == "__main__":
    # Demo run: feed a user-provided OCR dump through the parser and print
    # both the raw entries and the extracted fields.
    raw_entries = [
        "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)",
        "行 2: (… (置信度: 0.6244)",
        "行 3: 7.4km (置信度: 0.9975)",
        "行 4: 0.7111/度 (置信度: 0.9450)",
        "行 5: 快 (置信度: 0.9987)",
        "行 6: 闲3/4 (置信度: 0.9941)",
        "行 7: ¥ (置信度: 0.8734)",
        "行 8: 组团 (置信度: 0.9995)",
        "行 9: 2倍积分 (置信度: 0.9997)",
        "行 10: P (置信度: 0.9929)",
        "行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736)",
    ]

    print("--- Input Data ---")
    # One entry per output line, exactly as a per-item print loop would emit.
    print("\n".join(raw_entries))

    summary = parse_ocr_lines(raw_entries)
    print("\n--- Parsed Result (Regex) ---")
    # ensure_ascii=False keeps the Chinese text readable in the JSON dump.
    print(json.dumps(summary, ensure_ascii=False, indent=2))