Files
aiData/Test/TestOcrRegex.py
HuangHai b66f683dfb 'commit'
2026-01-12 07:49:18 +08:00

108 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import re
import json
# Patterns are compiled once at import time instead of on every loop pass.
# Each raw OCR line is wrapped as "<line-number prefix> text (<confidence>)".
_LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
_CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
_DIGITS_ONLY_RE = re.compile(r"^\d+$")
_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
_PILE_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
_PARKING_LEADING_RE = re.compile(r"^[·\.\sP]+")

# Lines starting with an opening parenthesis (ASCII or full-width) are OCR
# noise, not a station name.
_NAME_SKIP_PREFIXES = ("(", "(")
# Substrings that mark a line as an attribute (distance / price) line.
_NAME_SKIP_KEYWORDS = ("km", "/度")
# NOTE(review): the original pile-type markers were empty string literals
# (the glyphs appear to have been lost), which matched every line and forced
# the pile type to "". The labels below are an assumption based on the "快"
# label line in the sample data -- confirm against real OCR output.
_PILE_TYPE_LABELS = ("超", "快", "慢")
_UNKNOWN_PILE_TYPE = "未知"


def _clean_ocr_line(line):
    """Strip the line-number prefix and the confidence suffix from one line."""
    text = _LINE_PREFIX_RE.sub("", line)
    text = _CONFIDENCE_SUFFIX_RE.sub("", text)
    return text.strip()


def _find_station_name(clean_lines):
    """Return the first line that looks like a station name, or ""."""
    for line in clean_lines:
        # Too short to be a name (single glyphs such as icons or labels).
        if len(line) < 2:
            continue
        if line.startswith(_NAME_SKIP_PREFIXES):
            continue
        lowered = line.lower()
        # "km" is matched case-insensitively, "/度" as written.
        if "km" in lowered or "/度" in line:
            continue
        if _DIGITS_ONLY_RE.match(line):
            continue
        return line
    return ""


def _find_distance(clean_lines):
    """Return the first "<number>km" fragment found, or ""."""
    for line in clean_lines:
        match = _DISTANCE_RE.search(line)
        if match:
            return match.group(0)
    return ""


def _find_price(clean_lines):
    """Return the first decimal number directly followed by "/度", or None."""
    for line in clean_lines:
        match = _PRICE_RE.search(line)
        if match:
            return float(match.group(1))
    return None


def _collect_piles(clean_lines):
    """Return one dict per "free/total" count, tagged with the latest type."""
    piles = []
    current_type = _UNKNOWN_PILE_TYPE
    for line in clean_lines:
        # A type label applies to every count that follows it until the next
        # label; a label on the same line as the count is picked up first.
        for label in _PILE_TYPE_LABELS:
            if label in line:
                current_type = label
                break
        match = _PILE_RE.search(line)
        if match is None:
            continue
        # A dot means the fraction is part of a price-like value, not a count.
        if "." in line:
            continue
        piles.append({
            "type": current_type,
            "free": int(match.group(1)),
            "total": int(match.group(2)),
        })
    return piles


def _find_parking(clean_lines):
    """Return the first parking line without its leading bullet/icon, or ""."""
    for line in clean_lines:
        if "停车" in line:
            return _PARKING_LEADING_RE.sub("", line)
    return ""


def parse_ocr_lines(lines):
    """Extract charging-station fields from OCR output lines using regexes.

    Args:
        lines: Iterable of OCR line strings, each optionally wrapped with a
            line-number prefix and a confidence suffix. ``None`` or an empty
            iterable yields a result with every field at its default.

    Returns:
        A dict with the keys:
            station_name: first line that passes the name filters, else "".
            distance: first "<number>km" fragment, else "".
            price: float taken from the first "<decimal>/度", else None.
            piles: list of {"type", "free", "total"} dicts, one per
                "free/total" count found on a line without a dot.
            parking: first line containing the parking keyword with leading
                punctuation removed, else "".
            tags: always an empty list; no tag extraction is implemented.
    """
    clean_lines = [_clean_ocr_line(line) for line in (lines or ())]
    return {
        "station_name": _find_station_name(clean_lines),
        "distance": _find_distance(clean_lines),
        "price": _find_price(clean_lines),
        "piles": _collect_piles(clean_lines),
        "parking": _find_parking(clean_lines),
        "tags": [],
    }
if __name__ == "__main__":
    # Demo run on the user-provided sample: each pair is the recognised text
    # and its confidence (kept as a string so trailing zeros survive).
    ocr_samples = [
        ("长春市绿园区雁鸣湖公共充电站", "0.9963"),
        ("(…", "0.6244"),
        ("7.4km", "0.9975"),
        ("0.7111/度", "0.9450"),
        ("快", "0.9987"),
        ("闲3/4", "0.9941"),
        ("¥", "0.8734"),
        ("组团", "0.9995"),
        ("2倍积分", "0.9997"),
        ("P", "0.9929"),
        ("·收费停车:以场地实际收费为准", "0.9736"),
    ]
    # Rebuild the wrapped OCR lines, numbered from 1.
    sample_input = [
        "行 {}: {} (置信度: {})".format(index, text, confidence)
        for index, (text, confidence) in enumerate(ocr_samples, start=1)
    ]

    # Echo the raw input, one OCR line per output line.
    print("--- Input Data ---")
    print("\n".join(sample_input))

    # Parse and pretty-print the result, keeping non-ASCII text readable.
    parsed = parse_ocr_lines(sample_input)
    print("\n--- Parsed Result (Regex) ---")
    print(json.dumps(parsed, ensure_ascii=False, indent=2))