"""Vision-model helpers for reading XinDianTu charging-station screenshots.

The module wraps an OpenAI-compatible vision endpoint and post-processes its
JSON answers into click coordinates, station records and price schedules.
All public entry points are static methods of ``XinDianTuReadImageKit``.
"""
import asyncio
import base64
import hashlib
import json
import logging
import os
from typing import Optional

import aiohttp
import numpy as np
from openai import BadRequestError, OpenAI
from PIL import Image

from Config.Config import (
    ALY_LLM_API_KEY,
    VL_MODEL_NAME,
    VL_MODEL_NAME_AD,
    SAFE_EXCLUDE_RATIO,
    FALLBACK_WIDTH,
    FALLBACK_HEIGHT,
    BOTTOM_SAFE_EXCLUDE_RATIO,
)

# get_stations_hybrid() filters on these two names, but the original import
# list never provided them (NameError at runtime).  Import them when the
# project config defines them and fall back to "filter nothing" otherwise.
# NOTE(review): confirm these constants actually live in Config.Config.
try:
    from Config.Config import STATION_EXCLUDED_TITLES
except ImportError:
    STATION_EXCLUDED_TITLES = ()
try:
    from Config.Config import STATION_BLACKLIST_KEYWORDS
except ImportError:
    STATION_BLACKLIST_KEYWORDS = ()

logger = logging.getLogger(__name__)

# Row classes used by the divider-band segmentation in get_card_segments().
_ROW_CONTENT = 0
_ROW_DIVIDER = 1
_ROW_WHITE = 2

_MINUTES_PER_DAY = 24 * 60
_PRICE_KEYS = ("price_kwh", "electric_fee_kwh", "service_fee_kwh")
_BOUNDS_KEYS = (
    "bounds",
    "bounds_norm",
    "station_name_bounds",
    "station_name_bounds_norm",
)


class XinDianTuReadImageKit:
    """Static helpers that turn app screenshots into structured data."""

    # The client is created when the class body executes (import time).
    _client = OpenAI(
        api_key=ALY_LLM_API_KEY,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    # Generic fallback device info, used only when the caller supplies none.
    _FALLBACK_DEVICE_INFO = {
        "displayWidth": FALLBACK_WIDTH,
        "displayHeight": FALLBACK_HEIGHT,
        "productName": "generic",
    }

    # parse_first_level_image_url() always reports this X coordinate.
    # NOTE(review): the original code assigned the literal 100 to every
    # station unconditionally; the behaviour is kept, confirm it is intended.
    _FIRST_PAGE_CLICK_X = 100

    _prompt = (
        "仅输出JSON数组(不含任何说明文字),按从左到右、从上到下的顺序识别图片中的充电站区域。识别规则如下:\n"
        "1. 必须是卡片形式的充电站信息区域。\n"
        "2. 每一个卡片必须同时具备以下所有要素,否则严禁识别:\n"
        " - 场站名称 (station_name);\n"
        " - 距离信息 (distance, 例如 '5.3km'),位于卡片右上角;\n"
        " - 金额/电费 (price,例如 '0.8490');\n"
        " - 充电枪信息 (piles,包含'超'、'快'或'慢'的类型、总枪数和空闲枪数,例如 '快 闲4/4')。\n"
        "3. 如果缺少上述任何一项要素(例如只有名称和距离,但没有电费或枪数信息),说明它不是真正的场站卡片(可能是广告或功能入口),请直接跳过。\n"
        "\n"
        "JSON对象字段要求:\n"
        "1. b_use: 状态标识(1或0)。如果场站名称为灰色或带有“暂停使用”等标签,则为0,否则为1。\n"
        "2. station_name: 场站名称;\n"
        "3. price: 一度电的价格(数字);\n"
        "4. pro_price: Pro会员价格(数字),无则为null;\n"
        "5. piles: 充电枪列表 [{type: '快', free: 4, total: 4}];\n"
        "6. parking: 停车费用描述(通常在'P'图标后,例如 '收费停车:以场站实际收费规则为准' 或 '限时免费停车...')。\n"
        "7. distance: 距离信息字符串(例如 '5.3km')。\n"
        "8. bounds: {x1,y1,x2,y2} 区域像素坐标(0-1000);\n"
        "9. bounds_norm: {left,top,right,bottom} 归一化坐标(0-1);\n"
        "10. station_name_bounds: 场站名称文字区域坐标 {x1,y1,x2,y2}(0-1000);\n"
        "11. station_name_bounds_norm: 场站名称文字归一化坐标(0-1)。\n"
        "\n"
        "重要约束(违反者不予识别):\n"
        "A. 严禁识别广告位和筛选标签。如“夜间免停”、“洗手间”、“不限车长”、“不限车高”、“组团”、“综合排序”等均不是场站。\n"
        "B. 真正的场站卡片必须是一个横跨屏幕的大卡片,包含:场站名称(大号加粗)、金额(¥开头)、距离(km结尾)、充电枪状态(闲x/x)。\n"
        "C. 严禁将屏幕中间的筛选标签误认为场站卡片。\n"
        "\n"
        "严格返回纯JSON格式。"
    )

    # ------------------------------------------------------------------
    # Shared plumbing
    # ------------------------------------------------------------------
    @staticmethod
    def _resolve_device_info(device_info):
        """Return *device_info*, or the generic fallback (with a warning) if None."""
        if device_info is None:
            logger.warning("未提供动态设备信息,使用通用回退配置。")
            return XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
        return device_info

    @staticmethod
    def _build_content(image_url, prompt, device_info=None):
        """Build the user-message parts: image, optional device info, prompt."""
        parts = [{"type": "image_url", "image_url": {"url": image_url}}]
        if device_info is not None:
            parts.append(
                {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)}
            )
        parts.append({"type": "text", "text": prompt})
        return parts

    @staticmethod
    async def _chat(content, model=VL_MODEL_NAME, **create_kwargs) -> str:
        """Send one user message to the vision model and return its text.

        The synchronous client call runs in the default executor so the event
        loop is not blocked.  Exceptions raised by the client propagate.
        Returns "" when the model answers with no content.
        """

        def _request():
            return XinDianTuReadImageKit._client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": content}],
                **create_kwargs,
            )

        loop = asyncio.get_running_loop()
        resp = await loop.run_in_executor(None, _request)
        return resp.choices[0].message.content or ""

    @staticmethod
    def _extract_json(text: str) -> str:
        """Return the first decodable JSON object/array embedded in *text*.

        Markdown code-fence lines are dropped first.  Returns the string
        "[]" when *text* is empty or contains no decodable JSON value.
        """
        if not text:
            return "[]"
        cleaned = text.strip()
        if "```" in cleaned:
            cleaned = "\n".join(
                line
                for line in cleaned.splitlines()
                if not line.strip().startswith("```")
            ).strip()

        decoder = json.JSONDecoder()
        pos = 0
        while pos < len(cleaned):
            starts = [
                i for i in (cleaned.find("{", pos), cleaned.find("[", pos)) if i != -1
            ]
            if not starts:
                break
            start = min(starts)
            try:
                _, end = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                # Not valid JSON from this bracket; try the next candidate.
                pos = start + 1
                continue
            return cleaned[start:end]
        return "[]"

    # ------------------------------------------------------------------
    # Coordinate conversion
    # ------------------------------------------------------------------
    @staticmethod
    def _bounds_to_pixels(b_data, d_w, d_h):
        """Convert a model-supplied box to pixel ``(x1, y1, x2, y2)`` or None.

        *b_data* may be a 4-item list or a dict keyed by left/top/right/bottom
        or x1/y1/x2/y2.  The coordinate space is guessed from the largest
        value: <= 1.05 means 0-1 normalised, <= 1005 means 0-1000 normalised,
        anything larger is taken as absolute pixels.
        """
        if not b_data:
            return None
        if isinstance(b_data, list) and len(b_data) == 4:
            values = b_data
        elif isinstance(b_data, dict):
            values = []
            for norm_key, abs_key in (
                ("left", "x1"),
                ("top", "y1"),
                ("right", "x2"),
                ("bottom", "y2"),
            ):
                value = b_data.get(norm_key)
                if value is None:
                    value = b_data.get(abs_key)
                if value is None:
                    return None
                values.append(value)
        else:
            return None

        try:
            v1, v2, v3, v4 = (float(v) for v in values)
        except (ValueError, TypeError):
            return None

        max_v = max(v1, v2, v3, v4)
        if max_v <= 1.05:
            x1, y1, x2, y2 = v1 * d_w, v2 * d_h, v3 * d_w, v4 * d_h
        elif max_v <= 1005:
            x1, y1 = (v1 / 1000.0) * d_w, (v2 / 1000.0) * d_h
            x2, y2 = (v3 / 1000.0) * d_w, (v4 / 1000.0) * d_h
        else:
            x1, y1, x2, y2 = v1, v2, v3, v4
        return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)

    @staticmethod
    def _add_center(obj, device_info):
        """Shortcut for ``_add_click_point(obj, device_info, anchor="center")``."""
        return XinDianTuReadImageKit._add_click_point(
            obj, device_info, anchor="center"
        )

    @staticmethod
    def _add_click_point(
        obj, device_info, anchor: str = "center", *, apply_safe_filter: bool = True
    ):
        """Derive a tap point from the bounds in *obj* and store it in *obj*.

        The station-name text box is preferred; the whole-card box is the
        fallback.  On success ``uia_center_x`` / ``uia_center_y`` are set.
        The four bounds keys are removed from *obj* whenever it is returned.

        Args:
            obj: dict produced by the model; mutated in place.
            device_info: dict read for displayWidth/width and
                displayHeight/height (defaults 1080 x 2400).
            anchor: "top_left" taps near the card's top-left corner; any other
                value taps the horizontal centre at 40% of the card height.
            apply_safe_filter: when True (default) a tap point inside the top
                SAFE_EXCLUDE_RATIO or bottom BOTTOM_SAFE_EXCLUDE_RATIO band of
                the screen is rejected.

        Returns:
            *obj*, or None when the tap point was rejected by the safe filter
            (in which case *obj* is left untouched).
        """
        display_width = float(
            device_info.get("displayWidth") or device_info.get("width") or 1080
        )
        display_height = float(
            device_info.get("displayHeight") or device_info.get("height") or 2400
        )
        to_pixels = XinDianTuReadImageKit._bounds_to_pixels

        uia_x = None
        uia_y = None

        # Preferred: centre of the station-name text (safest place to tap).
        coords = to_pixels(
            obj.get("station_name_bounds_norm"), display_width, display_height
        ) or to_pixels(obj.get("station_name_bounds"), display_width, display_height)
        if coords:
            x1, y1, x2, y2 = coords
            uia_x = int(x1 + (x2 - x1) / 2)
            uia_y = int(y1 + (y2 - y1) / 2)
            logger.info(f"坐标计算: 使用文字区域 -> ({uia_x}, {uia_y}) | 区域: {coords} | 屏幕: {display_width}x{display_height}")

        # Fallback: the whole card region.
        if uia_x is None:
            coords = to_pixels(
                obj.get("bounds_norm"), display_width, display_height
            ) or to_pixels(obj.get("bounds"), display_width, display_height)
            if coords:
                x1, y1, x2, y2 = coords
                w, h = x2 - x1, y2 - y1
                if anchor == "top_left":
                    uia_x = int(x1 + max(5.0, w * 0.15))
                    uia_y = int(y1 + max(5.0, h * 0.20))
                else:
                    # Slightly above centre to stay clear of bottom buttons.
                    uia_x = int(x1 + w / 2)
                    uia_y = int(y1 + h * 0.4)
                logger.info(f"坐标计算: 使用卡片区域 -> ({uia_x}, {uia_y}) | 区域: {coords} | 屏幕: {display_width}x{display_height}")

        if apply_safe_filter and uia_y is not None:
            if uia_y < (display_height * SAFE_EXCLUDE_RATIO):
                logger.warning(f"安全排除: 坐标 ({uia_x}, {uia_y}) 位于屏幕顶部 {int(SAFE_EXCLUDE_RATIO*100)}% 区域,疑似广告或菜单,已忽略。")
                return None
            if uia_y > (display_height * (1 - BOTTOM_SAFE_EXCLUDE_RATIO)):
                logger.warning(f"安全排除: 坐标 ({uia_x}, {uia_y}) 位于屏幕底部 {int(BOTTOM_SAFE_EXCLUDE_RATIO*100)}% 区域,疑似底部功能区,已忽略。")
                return None

        if uia_x is not None and uia_y is not None:
            obj["uia_center_x"] = uia_x
            obj["uia_center_y"] = uia_y

        # The raw bounds are temporary model output; drop them.
        for key in _BOUNDS_KEYS:
            obj.pop(key, None)
        return obj

    # ------------------------------------------------------------------
    # Single-element locators
    # ------------------------------------------------------------------
    @staticmethod
    async def find_all_time_button_coordinate(
        image_url: str, device_info: dict = None
    ) -> dict:
        """Locate the "全部时段" button in the screenshot.

        Returns:
            ``{"uia_center_x": ..., "uia_center_y": ...}`` (values may be None
            if the bounds could not be converted), or ``{}`` when the model
            found nothing, the point was safe-filtered, or parsing failed.
            Errors raised by the model request itself propagate.
        """
        device_info = XinDianTuReadImageKit._resolve_device_info(device_info)
        prompt = (
            "仅输出JSON对象(不含任何说明文字),请找到图片中带有“全部时段”字样的按钮区域(通常在价格表下方,是一个带有右箭头的文字按钮)。\n"
            "返回格式示例:\n"
            "{\n"
            ' "bounds": {"x1": 100, "y1": 200, "x2": 300, "y2": 400}, \n'
            ' "bounds_norm": {"left": 0.1, "top": 0.2, "right": 0.3, "bottom": 0.4}\n'
            "}\n"
            "注意:bounds应使用0-1000的归一化坐标空间。\n"
            "如果未找到,返回空JSON {}。"
        )
        content = await XinDianTuReadImageKit._chat(
            XinDianTuReadImageKit._build_content(image_url, prompt, device_info)
        )
        raw = XinDianTuReadImageKit._extract_json(content)
        try:
            data = json.loads(raw)
            if not isinstance(data, dict):
                return {}
            if not (data.get("bounds") or data.get("bounds_norm")):
                return {}
            located = XinDianTuReadImageKit._add_center(data, device_info)
            if located is None:
                # Rejected by the safe-zone filter (already logged there).
                return {}
            return {
                "uia_center_x": located.get("uia_center_x"),
                "uia_center_y": located.get("uia_center_y"),
            }
        except Exception as e:
            logger.error(f"Error parsing JSON: {e}")
            logger.error(f"Raw content: {raw}")
            return {}

    @staticmethod
    async def find_station_coordinate_first_page(
        image_url: str, station_name: str, device_info: dict = None
    ) -> dict:
        """Locate the text of *station_name* on the first-level list page.

        Returns:
            ``{"uia_center_x": ..., "uia_center_y": ...}`` or ``{}`` when the
            station was not found, was safe-filtered, or parsing failed.
            Errors raised by the model request itself propagate.
        """
        device_info = XinDianTuReadImageKit._resolve_device_info(device_info)
        prompt = (
            "仅输出JSON对象(不含任何说明文字)。"
            "请在图片中找到场站名称完全匹配“" + str(station_name) + "”的那一行/卡片,"
            "返回该场站名称文字本身的区域坐标(优先返回归一化坐标)。"
            "返回格式示例:"
            "{\"station_name_bounds\": {\"x1\": 100, \"y1\": 200, \"x2\": 300, \"y2\": 240}, "
            "\"station_name_bounds_norm\": {\"left\": 0.1, \"top\": 0.2, \"right\": 0.3, \"bottom\": 0.24}}"
            "注意:bounds使用0-1000归一化坐标空间;如果找不到,返回空JSON {}。"
        )
        content = await XinDianTuReadImageKit._chat(
            XinDianTuReadImageKit._build_content(image_url, prompt, device_info)
        )
        raw = XinDianTuReadImageKit._extract_json(content)
        try:
            data = json.loads(raw)
            if not isinstance(data, dict):
                return {}
            if not any(data.get(key) for key in _BOUNDS_KEYS):
                return {}
            located = XinDianTuReadImageKit._add_click_point(
                data, device_info, anchor="station_text"
            )
            if located is None:
                return {}
            return {
                "uia_center_x": located.get("uia_center_x"),
                "uia_center_y": located.get("uia_center_y"),
            }
        except Exception as e:
            # Previously swallowed silently; keep the {} result but log it.
            logger.error(f"Error parsing JSON: {e}")
            logger.error(f"Raw content: {raw}")
            return {}

    # ------------------------------------------------------------------
    # First-level list page
    # ------------------------------------------------------------------
    @staticmethod
    async def parse_first_level_image_url(
        image_url: str, device_info: dict = None
    ) -> list:
        """Recognise the station cards on the first-level list screenshot.

        Cards with ``b_use != 1``, cards rejected by the safe-zone filter,
        cards without a station name and non-dict entries are dropped.  For
        cards whose tap point could not be derived, a second per-station
        lookup is attempted.

        Returns:
            A list of station dicts, each with ``uia_center_x`` and
            ``uia_center_y`` keys; ``[]`` when parsing fails.  Errors raised
            by the first model request propagate.
        """
        kit = XinDianTuReadImageKit
        device_info = kit._resolve_device_info(device_info)
        content = await kit._chat(
            kit._build_content(image_url, kit._prompt, device_info)
        )
        raw = kit._extract_json(content)
        try:
            data = json.loads(raw)
            if isinstance(data, dict):
                data = [data]
            elif not isinstance(data, list):
                return []

            stations = []
            for card in data:
                # A null / scalar entry used to abort the whole page.
                if not isinstance(card, dict) or card.get("b_use", 1) != 1:
                    continue
                located = kit._add_click_point(
                    card, device_info, anchor="station_text"
                )
                if located is None:
                    continue
                located.pop("b_use", None)
                if not located.get("station_name"):
                    continue
                stations.append(located)

            # Second chance for cards whose bounds could not be converted.
            missing = [
                s
                for s in stations
                if s.get("uia_center_x") is None or s.get("uia_center_y") is None
            ]
            if missing:
                lookups = await asyncio.gather(
                    *(
                        kit.find_station_coordinate_first_page(
                            image_url, s["station_name"], device_info
                        )
                        for s in missing
                    ),
                    return_exceptions=True,
                )
                for station, found in zip(missing, lookups):
                    if (
                        isinstance(found, dict)
                        and found.get("uia_center_x") is not None
                        and found.get("uia_center_y") is not None
                    ):
                        station["uia_center_x"] = found["uia_center_x"]
                        station["uia_center_y"] = found["uia_center_y"]

            for station in stations:
                station["uia_center_x"] = kit._FIRST_PAGE_CLICK_X
                station.setdefault("uia_center_y", None)
            return stations
        except Exception as e:
            logger.error(f"Error parsing JSON: {e}")
            logger.error(f"Raw content: {raw}")
            return []

    @staticmethod
    async def parse_hybrid_image(image_path, uploader, cdn_domain):
        """Unified entry point for hybrid recognition; see get_stations_hybrid."""
        return await XinDianTuReadImageKit.get_stations_hybrid(
            image_path, uploader, cdn_domain
        )

    @staticmethod
    async def parse_vl_image(vl_image_url, json_metadata, device_info=None):
        """Recognise stations from a green-box annotated image plus metadata.

        The model's i-th answer is paired with the i-th card of
        ``json_metadata["cards"]`` ordered by ``rect[1]``; the tap point comes
        from the card's ``click_point``.

        Returns:
            List of station dicts with ``uia_center_x``, ``uia_center_y`` and
            ``rect`` added.  ``[]`` when the metadata has no "cards" key.
        """
        if not json_metadata or "cards" not in json_metadata:
            return []

        # sorted() instead of .sort(): do not reorder the caller's metadata.
        cards_meta = sorted(json_metadata["cards"], key=lambda c: c["rect"][1])

        prompt = (
            "图片中用绿色矩形框标记了若干个充电站卡片区域。\n"
            "请按从上到下的顺序,依次识别每个绿色框内的场站信息,并返回一个JSON数组。\n"
            "数组中元素的顺序必须与图片中绿色框从上到下的顺序严格一致。\n"
            "如果某个框内不是有效的场站卡片(例如是广告),请返回null或空对象,不要跳过顺序。\n"
            "\n"
            "每个JSON对象包含以下字段:\n"
            "1. station_name: 场站名称;\n"
            "2. price: 价格(数字);\n"
            "3. pro_price: Pro会员价(数字,无则null);\n"
            "4. piles: 充电枪描述字符串(例如 '快 闲4/4');\n"
            "5. tags: 标签列表(如 ['限时免费']);\n"
            "6. parking: 停车费用描述(通常在'P'图标后,例如 '收费停车:以场站实际收费规则为准' 或 '限时免费停车...')。\n"
            "7. distance: 距离信息字符串(例如 '5.3km')。\n"
            "\n"
            "严格返回纯JSON格式。"
        )
        content = await XinDianTuReadImageKit._chat(
            XinDianTuReadImageKit._build_content(vl_image_url, prompt)
        )
        raw = XinDianTuReadImageKit._extract_json(content)

        final_stations = []
        try:
            llm_data = json.loads(raw)
            if not isinstance(llm_data, list):
                logger.warning(f"LLM return format error: expected list, got {type(llm_data)}")
                return final_stations

            # Screen height is the same for every card; compute it once.
            if device_info:
                img_h = float(
                    device_info.get("displayHeight")
                    or device_info.get("height")
                    or 2400
                )
            else:
                img_h = json_metadata.get("height", 2400)
            top_limit = img_h * SAFE_EXCLUDE_RATIO
            bottom_limit = img_h * (1 - BOTTOM_SAFE_EXCLUDE_RATIO)

            for item, meta in zip(llm_data, cards_meta):
                if not item or not isinstance(item, dict):
                    continue
                if not item.get("station_name"):
                    continue

                cx, cy = meta.get("click_point", [0, 0])
                if cy < top_limit:
                    logger.warning(f"VL安全排除: 坐标 ({cx}, {cy}) 位于屏幕顶部 {int(SAFE_EXCLUDE_RATIO*100)}% 区域,已忽略。")
                    continue
                if cy > bottom_limit:
                    logger.warning(f"VL安全排除: 坐标 ({cx}, {cy}) 位于屏幕底部 {int(BOTTOM_SAFE_EXCLUDE_RATIO*100)}% 区域,疑似底部功能区,已忽略。")
                    continue

                item["uia_center_x"] = cx
                item["uia_center_y"] = cy
                item["rect"] = meta.get("rect")
                final_stations.append(item)
        except Exception as e:
            logger.error(f"Error parsing VL response: {e}")
        return final_stations

    # ------------------------------------------------------------------
    # Hybrid mode: image segmentation + per-patch OCR
    # ------------------------------------------------------------------
    @staticmethod
    async def get_stations_hybrid(image_path, uploader, cdn_domain):
        """Slice the screenshot into cards, OCR each patch, merge results.

        Each patch is uploaded with ``uploader.upload_file(local, remote)``
        and read back through ``{cdn_domain}/{remote}``.

        Returns:
            List of OCR dicts with ``x`` / ``y`` set to the patch centre.
            Stations named "未知", listed in STATION_EXCLUDED_TITLES, or
            containing a STATION_BLACKLIST_KEYWORDS entry are removed.
        """
        kit = XinDianTuReadImageKit
        segments = kit.get_card_segments(image_path)
        if not segments:
            return []

        tasks = []
        for seg in segments:
            patch_path = seg["patch_path"]
            remote_path = f"tmp/patches/{os.path.basename(patch_path)}"
            uploader.upload_file(patch_path, remote_path)
            patch_url = f"{cdn_domain}/{remote_path}"
            seg["patch_url"] = patch_url
            tasks.append(kit.recognize_card_text(patch_url))

        results = await asyncio.gather(*tasks)

        final_stations = []
        for seg, res in zip(segments, results):
            if not isinstance(res, dict):
                continue
            name = res.get("station_name")
            if name and name != "未知":
                res["x"] = seg["center_x"]
                res["y"] = seg["center_y"]
                final_stations.append(res)

        if not final_stations:
            return final_stations

        # Post-filter on whitespace-normalised names.
        excluded_titles = {
            str(title).replace(" ", "").strip() for title in STATION_EXCLUDED_TITLES
        }
        filtered = []
        for item in final_stations:
            normalized = str(item.get("station_name")).replace(" ", "").strip()
            if normalized in excluded_titles:
                continue
            if any(kw in normalized for kw in STATION_BLACKLIST_KEYWORDS):
                continue
            filtered.append(item)
        return filtered

    @staticmethod
    def _classify_rows(img_data, height):
        """Label every pixel row as content, divider or white.

        Rows in the top 15% and bottom 15% of the image are labelled as
        dividers so that no segment can form there.  Otherwise: mean > 252 is
        white; 230 < mean <= 252 with std < 15 is a divider; anything else is
        content.  Returns ``(row_types, debug_lines)``.
        """
        row_types = []
        debug_lines = []
        for y in range(height):
            if y < height * 0.15 or y > height * 0.85:
                row_types.append(_ROW_DIVIDER)
                continue
            row = img_data[y]
            row_mean = np.mean(row)
            row_std = np.std(row)
            if row_mean > 252:
                r_type = _ROW_WHITE
            elif 230 < row_mean <= 252 and row_std < 15:
                r_type = _ROW_DIVIDER
            else:
                r_type = _ROW_CONTENT
            row_types.append(r_type)
            if y % 50 == 0:
                debug_lines.append(f"Row {y}: Mean={row_mean:.1f}, Std={row_std:.1f} -> Type={r_type}")
        return row_types, debug_lines

    @staticmethod
    def _find_content_blocks(row_types):
        """Return ``(start, end)`` runs of non-divider rows that hold content."""
        blocks = []
        seg_start = None
        has_content = False
        for y, r_type in enumerate(row_types):
            if r_type != _ROW_DIVIDER:
                if seg_start is None:
                    seg_start = y
                    has_content = False
                if r_type == _ROW_CONTENT:
                    has_content = True
            elif seg_start is not None:
                if has_content:
                    blocks.append((seg_start, y))
                seg_start = None
        if seg_start is not None and has_content:
            blocks.append((seg_start, len(row_types)))
        return blocks

    @staticmethod
    def _horizontal_content_span(img_data, y1, y2, width):
        """Find the left/right content edges of rows y1..y2, padded by 10 px.

        A column counts as content when its std exceeds 5 or its mean is
        below 245.  Only the outer 40% on each side is scanned, in steps of 2.
        """
        x1, x2 = 0, width
        for x in range(0, int(width * 0.4), 2):
            col = img_data[y1:y2, x]
            if np.std(col) > 5 or np.mean(col) < 245:
                x1 = x
                break
        for x in range(width - 1, int(width * 0.6), -2):
            col = img_data[y1:y2, x]
            if np.std(col) > 5 or np.mean(col) < 245:
                x2 = x + 1
                break
        return max(0, x1 - 10), min(width, x2 + 10)

    @staticmethod
    def get_card_segments(image_path, output_dir="./Debug/Patches"):
        """Cut station cards out of a screenshot using horizontal divider bands.

        Each accepted card is saved as a JPEG patch under *output_dir*.

        Returns:
            list of dicts ``{"patch_path": str, "center_x": int,
            "center_y": int, "y_range": (y1, y2)}``; ``[]`` on any failure
            while analysing the image (the error is logged).
        """
        kit = XinDianTuReadImageKit
        os.makedirs(output_dir, exist_ok=True)

        try:
            # convert() returns an independent image, so the file handle can
            # be released immediately.
            with Image.open(image_path) as source:
                img = source.convert('RGB')
            width, height = img.size
            img_data = np.array(img)
            logger.info(f"页面分析 - 分辨率: {width}x{height}")

            row_types, debug_rows = kit._classify_rows(img_data, height)
            if debug_rows:
                logger.info("行特征采样 (调试用):\n" + "\n".join(debug_rows))

            raw_blocks = kit._find_content_blocks(row_types)
            base_name = os.path.splitext(os.path.basename(image_path))[0]
            valid_segments = []
            logger.info(f"Initial raw blocks count: {len(raw_blocks)}")

            for i, (y1, y2) in enumerate(raw_blocks):
                h = y2 - y1
                # Too small (noise / a lone text line) or too large (whole
                # screen mis-detected).
                if h < 50 or h > 1000:
                    continue

                region_mean = np.mean(img_data[y1:y2])
                if region_mean > 254.5:
                    logger.info(f" [-] 忽略区域 {i}: Y({y1}-{y2}), H={h}, 整体太白 (Mean={region_mean:.1f} > 254.5)")
                    continue

                if h < 60:
                    logger.info(f" [-] 忽略区域 {i}: Y({y1}-{y2}), H={h}, 高度不足 (<60)")
                    continue

                # Margin brightness is logged only; it is not a filter.
                margin_w = max(5, int(width * 0.05))
                l_mean = np.mean(img_data[y1:y2, 0:margin_w])
                r_mean = np.mean(img_data[y1:y2, width - margin_w:width])

                x1, x2 = kit._horizontal_content_span(img_data, y1, y2, width)
                if (x2 - x1) < width * 0.5:
                    logger.info(f" [-] 忽略区域 {i}: 裁剪后宽度过小 ({x2-x1})")
                    continue

                patch_name = f"{base_name}_p{i}_{y1}.jpg"
                patch_path = os.path.join(output_dir, patch_name)
                img.crop((x1, y1, x2, y2)).save(patch_path)

                valid_segments.append({
                    "patch_path": os.path.abspath(patch_path),
                    "center_x": (x1 + x2) // 2,
                    "center_y": (y1 + y2) // 2,
                    "y_range": (y1, y2),
                })
                logger.info(f" [+] 发现卡片 {i}: Y({y1}-{y2}), H={h}, Crop X({x1}-{x2}), 边缘(L={l_mean:.1f}, R={r_mean:.1f}), 已保存")

            logger.info(f"分析完成:识别到 {len(valid_segments)} 个区域")
            return valid_segments
        except Exception as e:
            logger.error(f"图形学切片失败: {e}", exc_info=True)
            return []

    @staticmethod
    async def recognize_card_text(patch_url):
        """OCR one card patch and return its station details as a dict.

        Returns ``{"station_name": "未知"}`` when the answer holds no JSON
        object.  Errors raised by the model request propagate.
        """
        content = await XinDianTuReadImageKit._chat(
            [
                {
                    "type": "text",
                    "text": "请识别图片中的充电站信息,并以 JSON 格式输出:{\"station_name\": \"...\", \"price\": \"...\", \"piles\": \"空闲数/总数\"}。只输出 JSON,不要有其他文字。",
                },
                {"type": "image_url", "image_url": {"url": patch_url}},
            ],
            max_tokens=200,
        )
        content = content.strip()
        try:
            parsed = json.loads(XinDianTuReadImageKit._extract_json(content))
            # Tolerate a one-element array wrapped around the object.
            if isinstance(parsed, list) and parsed and isinstance(parsed[0], dict):
                parsed = parsed[0]
            if isinstance(parsed, dict) and parsed:
                return parsed
        except ValueError:
            logger.warning(f"OCR 结果解析 JSON 失败: {content}")
        return {"station_name": "未知"}

    # ------------------------------------------------------------------
    # Ad detection
    # ------------------------------------------------------------------
    @staticmethod
    async def detect_ad(image_url: str, device_info: dict = None) -> dict:
        """Detect a popup ad and locate its close button.

        Returns:
            ``{"has_ad": bool, "uia_center_x": int|None,
            "uia_center_y": int|None}``.  Errors raised by the model request
            propagate; parse errors yield ``has_ad=False``.
        """
        kit = XinDianTuReadImageKit
        device_info = kit._resolve_device_info(device_info)
        no_ad = {"has_ad": False, "uia_center_x": None, "uia_center_y": None}

        prompt = (
            "分析图片中是否存在覆盖在主界面上的广告弹窗(Popup Ad)。"
            "如果存在,请找到关闭该弹窗的按钮(通常是一个带有 'X' 的图标,可能在弹窗的右上角、右下角或正下方)。"
            "仅输出JSON对象(不含任何说明文字),包含以下字段:"
            "1. has_ad: 布尔值,是否存在广告弹窗;"
            "2. close_button_bounds: 关闭按钮的像素坐标或归一化坐标(0-1000) {x1,y1,x2,y2}。如果不存在广告则为 null。"
            "注意:只需识别最明显的那个关闭按钮。严格返回纯JSON。"
        )
        content = await kit._chat(
            kit._build_content(image_url, prompt, device_info),
            model=VL_MODEL_NAME_AD,
        )
        raw = kit._extract_json(content)
        try:
            data = json.loads(raw)
            if not isinstance(data, dict):
                return no_ad
            if data.get("has_ad", False) and data.get("close_button_bounds"):
                temp_obj = {"bounds": data.get("close_button_bounds")}
                # The helper is used purely for coordinate conversion here.
                # The safe-zone filter exists to avoid tapping ads, so it must
                # not suppress the ad's own close button.
                kit._add_click_point(temp_obj, device_info, apply_safe_filter=False)
                return {
                    "has_ad": True,
                    "uia_center_x": temp_obj.get("uia_center_x"),
                    "uia_center_y": temp_obj.get("uia_center_y"),
                }
            return no_ad
        except Exception as e:
            logger.error(f"Error parsing ad detection JSON: {e}")
            return no_ad

    # ------------------------------------------------------------------
    # Downloads
    # ------------------------------------------------------------------
    @staticmethod
    async def _download_bytes(url: str) -> Optional[bytes]:
        """GET *url*; return the body, or None when the status is not 200."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                if resp.status != 200:
                    return None
                return await resp.read()

    @staticmethod
    async def _download_as_base64(url: str) -> Optional[str]:
        """Download an image and return it Base64-encoded; None on failure."""
        try:
            content = await XinDianTuReadImageKit._download_bytes(url)
        except Exception as e:
            logger.error(f"Failed to download image for base64 conversion: {e}")
            return None
        if not content:
            return None
        return base64.b64encode(content).decode('utf-8')

    @staticmethod
    async def _fetch_md5(url: str) -> str:
        """Download *url* and return the hex MD5 of its bytes ("" on failure).

        The digest is used only for content de-duplication.
        """
        try:
            content = await XinDianTuReadImageKit._download_bytes(url)
        except Exception as e:
            logger.warning(f"Failed to download image for MD5: {e}")
            return ""
        if content is None:
            return ""
        return hashlib.md5(content).hexdigest()

    # ------------------------------------------------------------------
    # Price schedule
    # ------------------------------------------------------------------
    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Convert an int/float/numeric string to float; anything else -> None."""
        if not isinstance(value, (int, float, str)):
            return None
        try:
            return float(value)
        except ValueError:
            return None

    @staticmethod
    async def parse_price_schedule(
        station_name: str, image_url: str, device_info: dict = None
    ) -> list:
        """Parse a price-schedule screenshot into one row per time band.

        Args:
            station_name: copied into every returned row.
            image_url: screenshot containing the price table.
            device_info: optional device info sent along with the prompt.

        Returns:
            List of dicts with keys ``station_name``, ``start``, ``end``,
            ``price_kwh``, ``electric_fee_kwh``, ``service_fee_kwh``.  Values
            that are missing or not numeric become None and are then filled
            from the preceding row when that row has a value.  ``[]`` on any
            request or parsing failure.

        If the endpoint rejects the URL with a data-inspection / media-format
        BadRequestError, the image is downloaded and re-sent as a Base64 data
        URI.
        """
        kit = XinDianTuReadImageKit
        device_info = kit._resolve_device_info(device_info)

        prompt = (
            "仅输出JSON数组(不含任何说明文字)。识别图片中所有时段的价格信息,返回每一行:"
            "1) start: 开始时间(HH:MM),2) end: 结束时间(HH:MM),"
            "3) price_kwh: 价格(元/度,站点价或总价),"
            "4) electric_fee_kwh: 电费(元/度),"
            "5) service_fee_kwh: 服务费(元/度)。"
            "所有数值以数字返回,例如 1.1800。若缺失某项则填 null。严格返回纯JSON数组。"
            "注意:如果某行价格信息为空或表示同上,请尝试复用上一行的价格信息。"
        )

        async def _request(url_val):
            return await kit._chat(kit._build_content(url_val, prompt, device_info))

        try:
            content = await _request(image_url)
        except BadRequestError as e:
            err_code = getattr(e, 'code', '') or ''
            if not err_code and hasattr(e, 'body') and isinstance(e.body, dict):
                err_code = e.body.get('code', '')

            inspection_error = (
                'InvalidParameter.DataInspection' in str(err_code)
                or 'media format' in str(e).lower()
            )
            if not inspection_error:
                logger.error(f"API BadRequestError: {e}")
                return []

            logger.warning(f"Image URL rejected ({err_code}). Attempting Base64 fallback: {image_url}")
            base64_str = await kit._download_as_base64(image_url)
            if not base64_str:
                logger.error("Base64 download failed during fallback.")
                return []

            # Guess the MIME subtype from the URL; default to jpeg.
            lower_url = image_url.lower()
            if ".png" in lower_url:
                ext = "png"
            elif ".webp" in lower_url:
                ext = "webp"
            else:
                ext = "jpeg"
            data_uri = f"data:image/{ext};base64,{base64_str}"
            try:
                # This retry runs inside an except handler, so it needs its
                # own guard; the sibling handler below does not cover it.
                content = await _request(data_uri)
            except Exception as retry_err:
                logger.error(f"API Unexpected Error: {retry_err}")
                return []
        except Exception as e:
            logger.error(f"API Unexpected Error: {e}")
            return []

        raw = kit._extract_json(content)
        try:
            rows = json.loads(raw)
            if not isinstance(rows, list):
                return []

            norm = []
            for r in rows:
                if not isinstance(r, dict):
                    continue
                entry = {
                    "station_name": station_name,
                    "start": r.get("start"),
                    "end": r.get("end"),
                }
                for key in _PRICE_KEYS:
                    entry[key] = kit._to_float(r.get(key))
                norm.append(entry)

            # Forward-fill fees left empty ("same as above") by the model.
            for prev, curr in zip(norm, norm[1:]):
                for key in _PRICE_KEYS:
                    if curr.get(key) is None and prev.get(key) is not None:
                        curr[key] = prev[key]
            return norm
        except Exception as e:
            logger.error(f"Error parsing JSON: {e}")
            logger.error(f"Raw content: {raw}")
            return []

    # ------------------------------------------------------------------
    # Time helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_minutes(t) -> Optional[int]:
        """Convert "HH:MM" to minutes since midnight; None when unparsable."""
        if not t:
            return None
        try:
            hours, minutes = str(t).split(":")
            return int(hours) * 60 + int(minutes)
        except ValueError:
            return None

    @staticmethod
    def _to_minutes(t: str) -> int:
        """Convert "HH:MM" to minutes ("24:00" -> 1440); invalid input -> 0."""
        minutes = XinDianTuReadImageKit._parse_minutes(t)
        return 0 if minutes is None else minutes

    @staticmethod
    def _fmt(t: int) -> str:
        """Format a minute count as "HH:MM"."""
        h = t // 60
        m = t % 60
        return f"{h:02d}:{m:02d}"

    @staticmethod
    def _day_segments(start, end) -> list:
        """Return the ``(start_min, end_min)`` pieces a row covers in one day.

        A row whose end is later than its start yields one piece.  A row whose
        end is earlier than its start (e.g. 22:00-06:00 or 23:00-00:00) is
        treated as crossing midnight and split into an evening and a morning
        piece.  Rows with an unparsable end, or with start == end, yield
        nothing.  An unparsable start counts as 00:00.
        """
        s = XinDianTuReadImageKit._to_minutes(start)
        e = XinDianTuReadImageKit._parse_minutes(end)
        if e is None or e == s:
            return []
        if e > s:
            return [(s, e)]
        pieces = ((s, _MINUTES_PER_DAY), (0, e))
        return [(a, b) for a, b in pieces if b > a]

    @staticmethod
    def expand_schedule_to_hourly(rows: list) -> list:
        """Split every time band at hour boundaries.

        Args:
            rows: list as returned by parse_price_schedule.

        Returns:
            One dict per half-open hour slice ``[start, end)`` with the band's
            three fee values copied unchanged, e.g. 05:00-08:00 becomes
            05:00-06:00, 06:00-07:00, 07:00-08:00.  Bands crossing midnight
            are emitted as the evening part followed by the morning part.
        """
        kit = XinDianTuReadImageKit
        hourly = []
        for r in rows:
            for seg_start, seg_end in kit._day_segments(r.get("start"), r.get("end")):
                cur = seg_start
                while cur < seg_end:
                    # Next hour boundary, capped at the segment end.
                    nxt = min(seg_end, ((cur // 60) + 1) * 60)
                    hourly.append({
                        "start": kit._fmt(cur),
                        "end": kit._fmt(nxt),
                        "price_kwh": r.get("price_kwh"),
                        "electric_fee_kwh": r.get("electric_fee_kwh"),
                        "service_fee_kwh": r.get("service_fee_kwh"),
                    })
                    cur = nxt
        return hourly

    # ------------------------------------------------------------------
    # Second-level (detail) page
    # ------------------------------------------------------------------
    @staticmethod
    async def parse_address(
        station_name: str, image_url: str, device_info: dict = None
    ) -> dict:
        """Read the station's full name and address from the detail page.

        The "全部时段" button lookup runs concurrently and its coordinates are
        merged into the result.

        Returns:
            The model's dict (expected keys ``full_station_name`` and
            ``address``) updated with ``uia_center_x`` / ``uia_center_y`` when
            the button was found.  Errors raised by the address request
            propagate.
        """
        kit = XinDianTuReadImageKit
        device_info = kit._resolve_device_info(device_info)

        button_task = asyncio.create_task(
            kit.find_all_time_button_coordinate(image_url, device_info)
        )

        prompt = (
            "仅输出JSON对象(不含任何说明文字)。"
            "任务1:识别图片中充电站的完整名称(full_station_name)。"
            f"提示:列表中看到的名称可能是截断的(例如“{station_name}”),请在图片上方找到最匹配的完整名称。"
            "任务2:识别充电站的详细地址(address)。"
            "寻找规则:地址通常紧跟在场站名称下方,或者在‘距离’图标(定位小图表)附近,或者在带有‘导航’按钮的同一行。"
            "返回包含 full_station_name 和 address 字段的JSON对象,例如 {\"full_station_name\": \"完整名称\", \"address\": \"详细地址\"}。"
            "如果找不到,对应字段返回空字符串。"
            "严格返回纯JSON格式。"
        )
        try:
            content = await kit._chat(
                kit._build_content(image_url, prompt, device_info)
            )
        except BaseException:
            # Do not leave the background task dangling when we bail out.
            button_task.cancel()
            await asyncio.gather(button_task, return_exceptions=True)
            raise

        raw = kit._extract_json(content)
        result = {}
        try:
            data = json.loads(raw)
            if isinstance(data, dict):
                result = data
        except Exception as e:
            logger.error(f"Error parsing address JSON: {e}")

        try:
            button_result = await button_task
            if button_result:
                result.update(button_result)
        except Exception as e:
            logger.error(f"Error in button coordinate task: {e}")
        return result

    @staticmethod
    async def parse_price_schedule_multi(
        station_name: str, image_urls: list, device_info: dict = None
    ) -> list:
        """Parse several price-table screenshots and concatenate their rows.

        Images are de-duplicated by the MD5 of their bytes (first URL wins).
        An image whose bytes could not be downloaded is de-duplicated by URL
        instead of being dropped.

        Returns:
            The combined band list (not split by hour); ``[]`` for no input.
        """
        if not image_urls:
            return []
        kit = XinDianTuReadImageKit

        digests = await asyncio.gather(*(kit._fetch_md5(u) for u in image_urls))
        unique = {}
        for url, digest in zip(image_urls, digests):
            unique.setdefault(digest or f"url:{url}", url)

        combined = []
        for url in unique.values():
            rows = await kit.parse_price_schedule(
                station_name, url, device_info=device_info
            )
            if rows:
                combined.extend(rows)
        return combined

    @staticmethod
    def hourly_full_day(rows: list) -> list:
        """Normalise a band list into exactly 24 whole-hour records.

        Args:
            rows: band list (possibly merged from several images).

        Returns:
            24 dicts from 00:00-01:00 to 23:00-24:00.  Each hour takes its
            fees from the band overlapping it the longest; hours covered by no
            band get None fees.  Bands crossing midnight cover both the
            evening and the morning hours.
        """
        kit = XinDianTuReadImageKit
        intervals = []
        for r in rows:
            for seg_start, seg_end in kit._day_segments(r.get("start"), r.get("end")):
                intervals.append({
                    "s": max(0, seg_start),
                    "e": min(_MINUTES_PER_DAY, seg_end),
                    "price_kwh": r.get("price_kwh"),
                    "electric_fee_kwh": r.get("electric_fee_kwh"),
                    "service_fee_kwh": r.get("service_fee_kwh"),
                })
        intervals.sort(key=lambda it: (it["s"], it["e"]))

        result = []
        for hour in range(24):
            hs = hour * 60
            he = hs + 60
            best = None
            best_overlap = 0
            for it in intervals:
                overlap = max(0, min(he, it["e"]) - max(hs, it["s"]))
                if overlap > best_overlap:
                    best_overlap = overlap
                    best = it
            result.append({
                "start": kit._fmt(hs),
                "end": kit._fmt(he),
                "price_kwh": best["price_kwh"] if best else None,
                "electric_fee_kwh": best["electric_fee_kwh"] if best else None,
                "service_fee_kwh": best["service_fee_kwh"] if best else None,
            })
        return result


async def test1():
    """Manual check: read the first-level list page."""
    url = "https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/Temp/8fd79c68-fec6-4ca7-8d8e-fbff3c6862c8.jpg"
    print(f"Testing First Level with URL: {url}")
    result = await XinDianTuReadImageKit.parse_first_level_image_url(url)
    print("First Level Result:")
    print(json.dumps(result, ensure_ascii=False, indent=2))


async def test2(station_name: str):
    """Manual check: read the second-level (detail) page."""
    url = "https://dsideal.obs.myhuaweicloud.com/HuangHai/Temp/SecondPage.jpg"
    print(f"Testing address extraction from: {url}")
    result = await XinDianTuReadImageKit.parse_address(station_name, url)
    result["station_name"] = station_name
    print("Address result:")
    print(json.dumps(result, ensure_ascii=False, indent=2))


async def test3(station_name: str):
    """Manual check: parse the price-table screenshots."""
    samples = [
        "https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/Temp/XinDianTu/1.jpg",
        "https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/Temp/XinDianTu/2.jpg",
    ]
    rows = await XinDianTuReadImageKit.parse_price_schedule_multi(station_name, samples)
    hourly = XinDianTuReadImageKit.hourly_full_day(rows)
    print(json.dumps(hourly, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    # asyncio.run(test1())
    station_name = '吉林省看守所充电站'
    # asyncio.run(test2(station_name))
    asyncio.run(test3(station_name))