# coding=utf-8
"""Regex-based extraction of charging-station fields from OCR text lines."""
import json
import re

# Leading "行 N: " marker in front of each OCR line.
_LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
# Trailing "(置信度: ...)" confidence annotation.
_CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
# Distance such as "7.4km" (case-insensitive unit).
_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
# Decimal price immediately followed by "/度", e.g. "0.7111/度".
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
# Pile availability "闲3/4" or "3/4": optional "闲", int, "/", int.
_PILE_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
# Leading bullet / dot / whitespace / "P" noise in front of parking text.
_PARKING_LEAD_RE = re.compile(r"^[·\.\sP]+")
_DIGITS_ONLY_RE = re.compile(r"^\d+$")

# ASCII "(" and its full-width counterpart U+FF08; written as an escape so
# the two characters cannot be confused or normalised into one another.
_OPENING_PARENS = ("(", "\uff08")
# Pile-type markers, checked in this priority order.
_PILE_TYPES = ("快", "慢", "超")


def _clean_line(line):
    """Strip the "行 N: " prefix and "(置信度: ...)" suffix from one line."""
    text = _LINE_PREFIX_RE.sub("", line)
    text = _CONFIDENCE_SUFFIX_RE.sub("", text)
    return text.strip()


def _looks_like_station_name(line):
    """Return True if a cleaned line is a plausible station name."""
    # Empty or single-character lines are treated as noise.
    if len(line) < 2:
        return False
    # Lines opening with a parenthesis (ASCII or full-width) are skipped.
    if line.startswith(_OPENING_PARENS):
        return False
    # Distance / price lines are attributes, not names.
    if "km" in line.lower() or "/度" in line or "¥" in line:
        return False
    # A purely numeric line is unlikely to be a name.
    return not _DIGITS_ONLY_RE.match(line)


def _first_match(pattern, lines):
    """Return the first match of *pattern* over *lines*, or None."""
    for line in lines:
        found = pattern.search(line)
        if found:
            return found
    return None


def _collect_piles(lines):
    """Return the list of pile dicts ({"type", "free", "total"}) in *lines*."""
    piles = []
    # The most recently seen type marker applies to the following counts.
    current_type = "未知"
    for line in lines:
        for marker in _PILE_TYPES:
            if marker in line:
                current_type = marker
                break

        found = _PILE_RE.search(line)
        if not found:
            continue
        # A dot in the line suggests a price rather than a pile count.
        if "." in line:
            continue
        piles.append({
            "type": current_type,
            "free": int(found.group(1)),
            "total": int(found.group(2)),
        })
    return piles


def parse_ocr_lines(lines):
    """Parse OCR output lines of a charging-station card into a dict.

    Each input line may carry a "行 N: " prefix and a "(置信度: ...)"
    suffix; both are removed before any field is extracted.

    Args:
        lines: iterable of str, one OCR text line each.

    Returns:
        dict with the keys:
            "station_name": first line that passes the name heuristics
                (at least 2 chars, not starting with an ASCII or
                full-width opening parenthesis, no "km" / "/度" / "¥",
                not purely digits), or "" if none.
            "distance": first "<number>km" substring as text, or "".
            "price": float of the first decimal number directly followed
                by "/度", or None.
            "piles": list of {"type", "free", "total"} dicts for every
                "free/total" line without a dot; "type" is the latest
                of "快" / "慢" / "超" seen so far, else "未知".
            "parking": first line containing "停车" with leading
                "·", ".", whitespace and "P" characters removed, or "".
            "tags": always an empty list (never populated here).
    """
    result = {
        "station_name": "",
        "distance": "",
        "price": None,
        "piles": [],
        "parking": "",
        "tags": [],
    }

    clean_lines = [_clean_line(line) for line in lines]

    # 1. Station name
    result["station_name"] = next(
        (line for line in clean_lines if _looks_like_station_name(line)),
        "",
    )

    # 2. Distance
    distance = _first_match(_DISTANCE_RE, clean_lines)
    if distance:
        result["distance"] = distance.group(0)

    # 3. Price (standard), e.g. "0.xxxx/度"
    price = _first_match(_PRICE_RE, clean_lines)
    if price:
        result["price"] = float(price.group(1))

    # 4. Piles
    result["piles"] = _collect_piles(clean_lines)

    # 5. Parking
    for line in clean_lines:
        if "停车" in line:
            result["parking"] = _PARKING_LEAD_RE.sub("", line)
            break

    return result


def main():
    """Parse a built-in sample and print both the input and the result."""
    # User provided sample data
    sample_input = [
        "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)",
        "行 2: (… (置信度: 0.6244)",
        "行 3: 7.4km (置信度: 0.9975)",
        "行 4: 0.7111/度 (置信度: 0.9450)",
        "行 5: 快 (置信度: 0.9987)",
        "行 6: 闲3/4 (置信度: 0.9941)",
        "行 7: ¥ (置信度: 0.8734)",
        "行 8: 组团 (置信度: 0.9995)",
        "行 9: 2倍积分 (置信度: 0.9997)",
        "行 10: P (置信度: 0.9929)",
        "行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736)",
    ]

    print("--- Input Data ---")
    for raw_line in sample_input:
        print(raw_line)

    parsed = parse_ocr_lines(sample_input)

    print("\n--- Parsed Result (Regex) ---")
    print(json.dumps(parsed, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()