# aiData/Test/TestOcrRegex.py

# coding=utf-8
import re
import json

# Patterns are compiled once at import time instead of on every call.
_LINE_PREFIX_RE = re.compile(r"^行\s*\d+:\s*")
_CONFIDENCE_SUFFIX_RE = re.compile(r"\s*\(置信度:.*?\)\s*$")
_DISTANCE_RE = re.compile(r"(\d+(\.\d+)?)\s*km", re.IGNORECASE)
_PRICE_RE = re.compile(r"(\d+\.\d+)(?=/度)")
_PILE_COUNT_RE = re.compile(r"(?:闲)?(\d+)/(\d+)")
_PARKING_LEADING_RE = re.compile(r"^[·\.\sP]+")
# Checked in this order; the first keyword found in a line wins.
_PILE_TYPES = ("快", "慢", "超")


def parse_ocr_lines(lines):
    """Extract charging-station fields from OCR text lines.

    Each input line may carry a leading ``行 N: `` marker and a trailing
    ``(置信度: ...)`` confidence annotation; both are removed before the
    fields are searched for.

    Args:
        lines: Iterable of OCR line strings.

    Returns:
        A dict with the keys:

        * ``station_name`` -- first cleaned line that is at least two
          characters long, does not start with a parenthesis, contains
          none of "km", "/度", "￥", and is not purely digits; ``""`` if
          no line qualifies.
        * ``distance`` -- the first "<number>km" match, returned as the
          matched text; ``""`` if absent.
        * ``price`` -- the decimal number directly preceding "/度" in the
          first line that has one, as a float; ``None`` if absent.
        * ``piles`` -- list of ``{"type", "free", "total"}`` dicts, one per
          line holding an "a/b" count and no "." character. ``type`` is
          the most recently seen of "快"/"慢"/"超", or "未知" if none yet.
        * ``parking`` -- first line containing "停车" other than the line
          chosen as the station name, with leading "·", ".", "P" and
          whitespace removed; ``""`` if absent.
        * ``tags`` -- always an empty list; this function never fills it.
    """
    result = {
        "station_name": "",
        "distance": "",
        "price": None,
        "piles": [],
        "parking": "",
        "tags": []
    }

    # Strip the "行 X: " prefix and the "(置信度: ...)" suffix from each line.
    clean_lines = [
        _CONFIDENCE_SUFFIX_RE.sub("", _LINE_PREFIX_RE.sub("", line)).strip()
        for line in lines
    ]

    # 1. Station name: first line that does not look like noise or an
    #    attribute. Its index is kept so the parking search can skip it.
    name_index = None
    for index, line in enumerate(clean_lines):
        if len(line) < 2:
            continue
        if line.startswith(("（", "(")):
            continue
        if "km" in line.lower() or "/度" in line or "￥" in line:
            continue
        if re.match(r"^\d+$", line):
            continue
        result["station_name"] = line
        name_index = index
        break

    # 2. Distance: keep the matched text as-is (e.g. "7.4km").
    for line in clean_lines:
        m = _DISTANCE_RE.search(line)
        if m:
            result["distance"] = m.group(0)
            break

    # 3. Price: the decimal number immediately before "/度".
    for line in clean_lines:
        m = _PRICE_RE.search(line)
        if m:
            result["price"] = float(m.group(1))
            break

    # 4. Piles: a type keyword stays in effect for all following count
    #    lines until another keyword is seen.
    current_type = "未知"
    for line in clean_lines:
        for pile_type in _PILE_TYPES:
            if pile_type in line:
                current_type = pile_type
                break

        m = _PILE_COUNT_RE.search(line)
        # Lines containing a dot are treated as prices, not pile counts.
        if m is None or "." in line:
            continue

        result["piles"].append({
            "type": current_type,
            "free": int(m.group(1)),
            "total": int(m.group(2))
        })

    # 5. Parking: skip the station-name line, otherwise a name such as
    #    "...停车场充电站" would be reported as the parking description and
    #    hide the real parking line further down.
    for index, line in enumerate(clean_lines):
        if index == name_index:
            continue
        if "停车" in line:
            result["parking"] = _PARKING_LEADING_RE.sub("", line)
            break

    return result

if __name__ == "__main__":
    # Demo run: OCR lines from a user-supplied sample, each wrapped in a
    # "行 N: " prefix and a "(置信度: ...)" confidence suffix.
    demo_lines = [
        "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)",
        "行 2: （… (置信度: 0.6244)",
        "行 3: 7.4km (置信度: 0.9975)",
        "行 4: 0.7111/度 (置信度: 0.9450)",
        "行 5: 快 (置信度: 0.9987)",
        "行 6: 闲3/4 (置信度: 0.9941)",
        "行 7: ￥ (置信度: 0.8734)",
        "行 8: 组团 (置信度: 0.9995)",
        "行 9: 2倍积分 (置信度: 0.9997)",
        "行 10: P (置信度: 0.9929)",
        "行 11: ·收费停车：以场地实际收费为准 (置信度: 0.9736)",
    ]

    # Echo the raw input, one OCR line per output line.
    print("--- Input Data ---")
    print("\n".join(demo_lines))

    # Parse and pretty-print; ensure_ascii=False keeps the Chinese text readable.
    outcome = parse_ocr_lines(demo_lines)
    print("\n--- Parsed Result (Regex) ---")
    print(json.dumps(outcome, ensure_ascii=False, indent=2))