Files
aiData/Apps/XinDianTu/XinDianTuReadImageKit.py
HuangHai ca23ebf606 'commit'
2026-01-12 08:09:32 +08:00

1291 lines
56 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
from PIL import Image
import os
import asyncio
import hashlib
import json
import aiohttp
import logging
import base64
from openai import OpenAI, BadRequestError
from Config.Config import (
ALY_LLM_API_KEY, VL_MODEL_NAME, VL_MODEL_NAME_AD,
SAFE_EXCLUDE_RATIO, FALLBACK_WIDTH, FALLBACK_HEIGHT,
BOTTOM_SAFE_EXCLUDE_RATIO
)
logger = logging.getLogger(__name__)
class XinDianTuReadImageKit:
    """Static helpers that read XinDianTu (charging-station app) screenshots.

    Screenshots are sent to a vision model through an OpenAI-compatible
    endpoint and the JSON answers are post-processed into station data and
    screen tap coordinates.
    """

    # Shared client, created once when the class body is executed (import time).
    # It targets the DashScope OpenAI-compatible endpoint with the configured key.
    _client = OpenAI(
        api_key=ALY_LLM_API_KEY,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    # Generic fallback device info, used only when the caller cannot supply
    # the real device information.
    _FALLBACK_DEVICE_INFO = {
        "displayWidth": FALLBACK_WIDTH,
        "displayHeight": FALLBACK_HEIGHT,
        "productName": "generic"
    }
    # Prompt for the first-level (station list) screenshot. It asks the model
    # for a JSON array of station cards with name, price, piles, parking,
    # distance and several bounds fields. The text is sent verbatim to the
    # model, so it is runtime data and must not be edited casually.
    _prompt = (
        "仅输出JSON数组不含任何说明文字按从左到右、从上到下的顺序识别图片中的充电站区域。识别规则如下\n"
        "1. 必须是卡片形式的充电站信息区域。\n"
        "2. 每一个卡片必须同时具备以下所有要素,否则严禁识别:\n"
        " - 场站名称 (station_name)\n"
        " - 距离信息 (distance, 例如 '5.3km'),位于卡片右上角;\n"
        " - 金额/电费 (price例如 '0.8490')\n"
        " - 充电枪信息 (piles包含''''''的类型、总枪数和空闲枪数,例如 '快 闲4/4')。\n"
        "3. 如果缺少上述任何一项要素(例如只有名称和距离,但没有电费或枪数信息),说明它不是真正的场站卡片(可能是广告或功能入口),请直接跳过。\n"
        "\n"
        "JSON对象字段要求\n"
        "1. b_use: 状态标识1或0。如果场站名称为灰色或带有“暂停使用”等标签则为0否则为1。\n"
        "2. station_name: 场站名称;\n"
        "3. price: 一度电的价格(数字);\n"
        "4. pro_price: Pro会员价格数字无则为null\n"
        "5. piles: 充电枪列表 [{type: '', free: 4, total: 4}]\n"
        "6. parking: 停车费用描述(通常在'P'图标后,例如 '收费停车:以场站实际收费规则为准''限时免费停车...')。\n"
        "7. distance: 距离信息字符串(例如 '5.3km')。\n"
        "8. bounds: {x1,y1,x2,y2} 区域像素坐标0-1000\n"
        "9. bounds_norm: {left,top,right,bottom} 归一化坐标(0-1)\n"
        "10. station_name_bounds: 场站名称文字区域坐标 {x1,y1,x2,y2}0-1000\n"
        "11. station_name_bounds_norm: 场站名称文字归一化坐标(0-1)。\n"
        "\n"
        "重要约束(违反者不予识别):\n"
        "A. 严禁识别广告位和筛选标签。如“夜间免停”、“洗手间”、“不限车长”、“不限车高”、“组团”、“综合排序”等均不是场站。\n"
        "B. 真正的场站卡片必须是一个横跨屏幕的大卡片包含场站名称大号加粗、金额¥开头、距离km结尾、充电枪状态闲x/x\n"
        "C. 严禁将屏幕中间的筛选标签误认为场站卡片。\n"
        "\n"
        "严格返回纯JSON格式。"
    )
@staticmethod
def _extract_json(text: str) -> str:
if not text:
return "[]"
cleaned = text.strip()
if "```" in cleaned:
lines = []
for line in cleaned.splitlines():
if line.strip().startswith("```"):
continue
lines.append(line)
cleaned = "\n".join(lines).strip()
decoder = json.JSONDecoder()
pos = 0
while pos < len(cleaned):
idx_dict = cleaned.find("{", pos)
idx_list = cleaned.find("[", pos)
candidates = [i for i in (idx_dict, idx_list) if i != -1]
if not candidates:
break
start = min(candidates)
snippet = cleaned[start:]
try:
_, end = decoder.raw_decode(snippet)
return snippet[:end]
except json.JSONDecodeError:
pos = start + 1
continue
return "[]"
@staticmethod
def _add_center(obj, device_info):
    """Attach a tap point to *obj* using the ``"center"`` anchor.

    Thin wrapper: forwards to ``_add_click_point`` and returns its result
    unchanged (which may be ``None``).
    """
    center_anchor = "center"
    return XinDianTuReadImageKit._add_click_point(obj, device_info, anchor=center_anchor)
@staticmethod
def _add_click_point(obj, device_info, anchor: str = "center"):
    """Compute a screen tap point for a recognised region and store it on *obj*.

    The station-name text bounds are preferred; the whole-card bounds are the
    fallback. On success ``uia_center_x`` / ``uia_center_y`` are written to
    *obj*, and the four temporary bounds fields are removed from it.

    Args:
        obj: Dict that may carry ``bounds``, ``bounds_norm``,
            ``station_name_bounds`` and ``station_name_bounds_norm``.
        device_info: Dict read for ``displayWidth``/``width`` and
            ``displayHeight``/``height`` (defaults 1080 x 2400).
        anchor: Only ``"top_left"`` is treated specially, and only on the
            card-bounds fallback path; any other value uses the
            horizontal-centre / 40%-height point.

    Returns:
        *obj* (mutated in place), or ``None`` when the computed point lies in
        the excluded top or bottom band of the screen. If no bounds could be
        resolved, *obj* is returned without the ``uia_center_*`` keys.
    """
    # Read the display size, tolerating alternative key names and missing values.
    display_width = float(device_info.get("displayWidth") or device_info.get("width") or 1080)
    display_height = float(device_info.get("displayHeight") or device_info.get("height") or 2400)
    bounds = obj.get("bounds")
    bn = obj.get("bounds_norm")
    text_bounds = obj.get("station_name_bounds")
    text_bn = obj.get("station_name_bounds_norm")
    uia_x = None
    uia_y = None

    def get_pixel_coords(b_data, d_w, d_h):
        # Convert one bounds value (4-item list or dict) to pixel coordinates
        # (x_min, y_min, x_max, y_max); returns None when it cannot be read.
        if not b_data:
            return None
        # Extract values based on dict or list
        if isinstance(b_data, list) and len(b_data) == 4:
            v1, v2, v3, v4 = b_data
        elif isinstance(b_data, dict):
            # Accept either left/top/right/bottom or x1/y1/x2/y2 key names.
            v1 = b_data.get("left") if b_data.get("left") is not None else b_data.get("x1")
            v2 = b_data.get("top") if b_data.get("top") is not None else b_data.get("y1")
            v3 = b_data.get("right") if b_data.get("right") is not None else b_data.get("x2")
            v4 = b_data.get("bottom") if b_data.get("bottom") is not None else b_data.get("y2")
            if any(v is None for v in (v1, v2, v3, v4)):
                return None
        else:
            return None
        try:
            v1, v2, v3, v4 = float(v1), float(v2), float(v3), float(v4)
        except (ValueError, TypeError):
            return None
        # The coordinate space is guessed from the largest value.
        max_v = max(v1, v2, v3, v4)
        # 1. Normalised 0-1 coordinates (small tolerance above 1).
        if max_v <= 1.05:
            x1, y1, x2, y2 = v1 * d_w, v2 * d_h, v3 * d_w, v4 * d_h
        # 2. Normalised 0-1000 coordinates (original note: common for Qwen-VL).
        elif max_v <= 1005:
            x1, y1, x2, y2 = (v1 / 1000.0) * d_w, (v2 / 1000.0) * d_h, (v3 / 1000.0) * d_w, (v4 / 1000.0) * d_h
        # 3. Absolute pixel coordinates.
        else:
            x1, y1, x2, y2 = v1, v2, v3, v4
        # Order the corners so the result is always (min, min, max, max).
        return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)

    # Prefer tapping the station-name text area (original note: the safest target).
    coords = get_pixel_coords(text_bn, display_width, display_height) or get_pixel_coords(text_bounds, display_width, display_height)
    if coords:
        x1, y1, x2, y2 = coords
        uia_x = int(x1 + (x2 - x1) / 2)
        uia_y = int(y1 + (y2 - y1) / 2)
        logger.info(f"坐标计算: 使用文字区域 -> ({uia_x}, {uia_y}) | 区域: {coords} | 屏幕: {display_width}x{display_height}")
    # Fallback: use the whole card area.
    if uia_x is None:
        coords = get_pixel_coords(bn, display_width, display_height) or get_pixel_coords(bounds, display_width, display_height)
        if coords:
            x1, y1, x2, y2 = coords
            w, h = x2 - x1, y2 - y1
            if anchor == "top_left":
                uia_x = int(x1 + max(5.0, w * 0.15))
                uia_y = int(y1 + max(5.0, h * 0.20))
            else:
                # Default: horizontal centre, slightly above the vertical
                # centre to stay clear of possible buttons at the bottom.
                uia_x = int(x1 + w / 2)
                uia_y = int(y1 + h * 0.4)
            logger.info(f"坐标计算: 使用卡片区域 -> ({uia_x}, {uia_y}) | 区域: {coords} | 屏幕: {display_width}x{display_height}")
    # --- Safety filter: reject tap points in the top/bottom bands of the screen ---
    if uia_y is not None:
        # A point inside the top SAFE_EXCLUDE_RATIO share of the screen is
        # treated as a likely ad or menu and rejected.
        if uia_y < (display_height * SAFE_EXCLUDE_RATIO):
            logger.warning(f"安全排除: 坐标 ({uia_x}, {uia_y}) 位于屏幕顶部 {int(SAFE_EXCLUDE_RATIO*100)}% 区域,疑似广告或菜单,已忽略。")
            return None
        # A point inside the bottom BOTTOM_SAFE_EXCLUDE_RATIO share is treated
        # as the bottom function area (e.g. scan-to-charge) and rejected.
        if uia_y > (display_height * (1 - BOTTOM_SAFE_EXCLUDE_RATIO)):
            logger.warning(f"安全排除: 坐标 ({uia_x}, {uia_y}) 位于屏幕底部 {int(BOTTOM_SAFE_EXCLUDE_RATIO*100)}% 区域,疑似底部功能区,已忽略。")
            return None
    if uia_x is not None and uia_y is not None:
        obj["uia_center_x"] = uia_x
        obj["uia_center_y"] = uia_y
    # Clean up temporary fields
    for k in ["bounds", "bounds_norm", "station_name_bounds", "station_name_bounds_norm"]:
        if k in obj:
            del obj[k]
    return obj
@staticmethod
async def find_all_time_button_coordinate(image_url: str, device_info: dict = None) -> dict:
    """Locate the "全部时段" (all time periods) button in a screenshot.

    The image URL, the device info (as JSON text) and a prompt are sent to
    the vision model. The bounds it reports are turned into a tap point by
    ``_add_center``.

    Args:
        image_url: URL of the screenshot handed to the model.
        device_info: Display metadata; ``_FALLBACK_DEVICE_INFO`` is used when
            omitted.

    Returns:
        ``{"uia_center_x": ..., "uia_center_y": ...}`` when the model reported
        bounds and the point was not rejected (values may be ``None`` if the
        bounds were unreadable). ``{}`` when nothing was found, the point was
        rejected by the safety bands, or the answer could not be processed.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    prompt = (
        "仅输出JSON对象不含任何说明文字请找到图片中带有“全部时段”字样的按钮区域通常在价格表下方是一个带有右箭头的文字按钮\n"
        "返回格式示例:\n"
        "{\n"
        ' "bounds": {"x1": 100, "y1": 200, "x2": 300, "y2": 400}, \n'
        ' "bounds_norm": {"left": 0.1, "top": 0.2, "right": 0.3, "bottom": 0.4}\n'
        "}\n"
        "注意bounds应使用0-1000的归一化坐标空间。\n"
        "如果未找到返回空JSON {}"
    )
    # The client call is synchronous, so it runs in the default executor.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None,
        lambda: XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url
                            },
                        },
                        {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
                        {"type": "text", "text": prompt},
                    ],
                },
            ],
        )
    )
    content = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(content)
    try:
        data = json.loads(raw)
        if not isinstance(data, dict) or not (data.get("bounds") or data.get("bounds_norm")):
            return {}
        located = XinDianTuReadImageKit._add_center(data, device_info)
        if located is None:
            # The tap point fell into an excluded screen band. This is an
            # expected outcome, not a parsing error.
            return {}
        # Only the centre coordinates are returned.
        return {
            "uia_center_x": located.get("uia_center_x"),
            "uia_center_y": located.get("uia_center_y")
        }
    except Exception as e:
        # Best effort: any failure while processing the answer yields {}.
        logger.error(f"Error parsing JSON: {e}")
        logger.error(f"Raw content: {raw}")
        return {}
@staticmethod
async def find_station_coordinate_first_page(image_url: str, station_name: str, device_info: dict = None) -> dict:
    """Ask the vision model where a given station name is drawn on the screenshot.

    Args:
        image_url: URL of the station-list screenshot.
        station_name: Name to match; it is inserted into the prompt text.
        device_info: Display metadata; ``_FALLBACK_DEVICE_INFO`` is used when
            omitted.

    Returns:
        ``{"uia_center_x": ..., "uia_center_y": ...}`` when the model reported
        bounds and the point was not rejected (values may be ``None`` if the
        bounds were unreadable). ``{}`` when nothing was found, the point was
        rejected by the safety bands, or the answer could not be processed.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    prompt = (
        "仅输出JSON对象不含任何说明文字"
        "请在图片中找到场站名称完全匹配“" + str(station_name) + "”的那一行/卡片,"
        "返回该场站名称文字本身的区域坐标(优先返回归一化坐标)。"
        "返回格式示例:"
        "{\"station_name_bounds\": {\"x1\": 100, \"y1\": 200, \"x2\": 300, \"y2\": 240}, "
        "\"station_name_bounds_norm\": {\"left\": 0.1, \"top\": 0.2, \"right\": 0.3, \"bottom\": 0.24}}"
        "注意bounds使用0-1000归一化坐标空间如果找不到返回空JSON {}"
    )
    # The client call is synchronous, so it runs in the default executor.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None,
        lambda: XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
                        {"type": "text", "text": prompt},
                    ],
                },
            ],
        )
    )
    content = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(content)
    try:
        data = json.loads(raw)
        if not isinstance(data, dict):
            return {}
        has_bounds = (
            data.get("station_name_bounds")
            or data.get("station_name_bounds_norm")
            or data.get("bounds")
            or data.get("bounds_norm")
        )
        if not has_bounds:
            return {}
        located = XinDianTuReadImageKit._add_click_point(data, device_info, anchor="station_text")
        if located is None:
            # The tap point fell into an excluded screen band.
            return {}
        return {
            "uia_center_x": located.get("uia_center_x"),
            "uia_center_y": located.get("uia_center_y")
        }
    except Exception as e:
        # Still best effort, but the failure is now logged instead of
        # being swallowed silently.
        logger.error(f"Error locating station coordinate: {e}")
        logger.error(f"Raw content: {raw}")
        return {}
@staticmethod
async def parse_first_level_image_url(image_url: str, device_info: dict = None) -> list:
    """Recognise the station cards on a first-level (station list) screenshot.

    The model is asked (via ``_prompt``) for a JSON array of cards. Each card
    is then given a tap point. Cards without coordinates get a second,
    per-station lookup through ``find_station_coordinate_first_page``.

    Args:
        image_url: URL of the screenshot handed to the model.
        device_info: Display metadata; ``_FALLBACK_DEVICE_INFO`` is used when
            omitted.

    Returns:
        A list of station dicts, each with ``uia_center_x`` and
        ``uia_center_y`` keys (``uia_center_y`` may be ``None``). ``[]`` when
        the answer could not be processed.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    # The client call is synchronous, so it runs in the default executor.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None,
        lambda: XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url
                            },
                        },
                        {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
                        {"type": "text", "text": XinDianTuReadImageKit._prompt},
                    ],
                },
            ],
        )
    )
    content = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(content)
    try:
        parsed = json.loads(raw)
        # A single object is handled exactly like a one-element array.
        if isinstance(parsed, dict):
            candidates = [parsed]
        elif isinstance(parsed, list):
            candidates = parsed
        else:
            candidates = []

        data = []
        for card in candidates:
            # Skip array elements that are not objects (e.g. null); a single
            # one must not discard every other card.
            if not isinstance(card, dict):
                continue
            # b_use != 1 marks a station the model flagged as out of service.
            if card.get("b_use", 1) != 1:
                continue
            card = XinDianTuReadImageKit._add_click_point(card, device_info, anchor="station_text")
            if card is None:
                # Rejected by the top/bottom safety bands.
                continue
            card.pop("b_use", None)
            # No keyword filtering here: the model's result is trusted, and
            # only cards without a station name are dropped.
            if card.get("station_name"):
                data.append(card)

        # Second chance for cards that still lack a coordinate.
        missing = [
            card for card in data
            if card.get("uia_center_x") is None or card.get("uia_center_y") is None
        ]
        if missing:
            results = await asyncio.gather(
                *(
                    XinDianTuReadImageKit.find_station_coordinate_first_page(
                        image_url, card["station_name"], device_info
                    )
                    for card in missing
                ),
                return_exceptions=True,
            )
            for card, found in zip(missing, results):
                if (
                    isinstance(found, dict)
                    and found.get("uia_center_x") is not None
                    and found.get("uia_center_y") is not None
                ):
                    card["uia_center_x"] = found["uia_center_x"]
                    card["uia_center_y"] = found["uia_center_y"]

        for card in data:
            # NOTE(review): x is forced to 100 for every card, discarding the
            # x computed above. Presumably a deliberate "tap near the left
            # edge" workaround; confirm, and consider scaling it with the
            # display width.
            card["uia_center_x"] = 100
            card.setdefault("uia_center_y", None)
        return data
    except Exception as e:
        logger.error(f"Error parsing JSON: {e}")
        logger.error(f"Raw content: {raw}")
        return []
@staticmethod
async def parse_hybrid_image(image_path, uploader, cdn_domain):
    """Unified entry point for the hybrid recognition mode.

    Forwards all three arguments to ``get_stations_hybrid`` and returns its
    result unchanged.
    """
    stations = await XinDianTuReadImageKit.get_stations_hybrid(image_path, uploader, cdn_domain)
    return stations
@staticmethod
async def parse_vl_image(vl_image_url, json_metadata, device_info=None):
    """Recognise stations on a pre-annotated image and pair them with metadata.

    The image is expected to have green rectangles drawn around the cards
    (that is what the prompt tells the model). The model returns one entry
    per rectangle, top to bottom; entry *i* is paired with the *i*-th card of
    ``json_metadata["cards"]`` sorted by ``rect[1]``.

    Args:
        vl_image_url: URL of the annotated image.
        json_metadata: Dict with a ``"cards"`` list; each card supplies
            ``rect`` and, optionally, ``click_point``. ``"height"`` is read
            when *device_info* is not given.
        device_info: Optional display metadata (``displayHeight``/``height``).

    Returns:
        A list of station dicts enriched with ``uia_center_x``,
        ``uia_center_y`` and ``rect``. Empty when the metadata is missing or
        the answer could not be processed.
    """
    if not json_metadata or "cards" not in json_metadata:
        return []
    # Order the cards by their top edge. sorted() leaves the caller's list
    # untouched; the previous in-place sort reordered json_metadata["cards"].
    cards_meta = sorted(json_metadata["cards"], key=lambda card: card["rect"][1])
    prompt = (
        "图片中用绿色矩形框标记了若干个充电站卡片区域。\n"
        "请按从上到下的顺序依次识别每个绿色框内的场站信息并返回一个JSON数组。\n"
        "数组中元素的顺序必须与图片中绿色框从上到下的顺序严格一致。\n"
        "如果某个框内不是有效的场站卡片例如是广告请返回null或空对象不要跳过顺序。\n"
        "\n"
        "每个JSON对象包含以下字段\n"
        "1. station_name: 场站名称;\n"
        "2. price: 价格(数字);\n"
        "3. pro_price: Pro会员价数字无则null\n"
        "4. piles: 充电枪描述字符串(例如 '快 闲4/4'\n"
        "5. tags: 标签列表(如 ['限时免费']\n"
        "6. parking: 停车费用描述(通常在'P'图标后,例如 '收费停车:以场站实际收费规则为准''限时免费停车...')。\n"
        "7. distance: 距离信息字符串(例如 '5.3km')。\n"
        "\n"
        "严格返回纯JSON格式。"
    )
    # The client call is synchronous, so it runs in the default executor.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None,
        lambda: XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": vl_image_url}},
                        {"type": "text", "text": prompt},
                    ],
                },
            ],
        )
    )
    content = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(content)
    final_stations = []
    try:
        llm_data = json.loads(raw)
        if isinstance(llm_data, list):
            # The reference height does not change per card, so compute the
            # two safety limits once.
            if device_info:
                img_h = float(device_info.get("displayHeight") or device_info.get("height") or 2400)
            else:
                img_h = json_metadata.get("height", 2400)
            top_limit = img_h * SAFE_EXCLUDE_RATIO
            bottom_limit = img_h * (1 - BOTTOM_SAFE_EXCLUDE_RATIO)

            # zip() stops at the shorter sequence, aligning answers with cards.
            for item, meta in zip(llm_data, cards_meta):
                if not item or not isinstance(item, dict):
                    continue
                if not item.get("station_name"):
                    continue
                # Take the tap point from the metadata. A malformed entry only
                # skips this card instead of aborting all remaining ones.
                try:
                    cx, cy = meta.get("click_point", [0, 0])
                except (AttributeError, TypeError, ValueError):
                    logger.warning(f"VL metadata click_point invalid, card skipped: {meta}")
                    continue
                # Top safety exclusion.
                if cy < top_limit:
                    logger.warning(f"VL安全排除: 坐标 ({cx}, {cy}) 位于屏幕顶部 {int(SAFE_EXCLUDE_RATIO*100)}% 区域,已忽略。")
                    continue
                # Bottom safety exclusion.
                if cy > bottom_limit:
                    logger.warning(f"VL安全排除: 坐标 ({cx}, {cy}) 位于屏幕底部 {int(BOTTOM_SAFE_EXCLUDE_RATIO*100)}% 区域,疑似底部功能区,已忽略。")
                    continue
                item["uia_center_x"] = cx
                item["uia_center_y"] = cy
                item["rect"] = meta.get("rect")
                final_stations.append(item)
        else:
            logger.warning(f"LLM return format error: expected list, got {type(llm_data)}")
    except Exception as e:
        logger.error(f"Error parsing VL response: {e}")
    return final_stations
@staticmethod
async def get_stations_hybrid(image_path, uploader, cdn_domain):
    """Hybrid recognition: graphical slicing plus per-patch model OCR.

    Steps:
        1. ``get_card_segments`` cuts the screenshot into card patches.
        2. Each patch is uploaded with ``uploader.upload_file`` and recognised
           by ``recognize_card_text``; the recognitions run concurrently.
        3. Results with a usable station name get the patch centre as
           ``x`` / ``y``.
        4. Names matching the optional module-level exclusion lists are
           dropped.

    Args:
        image_path: Local path of the full screenshot.
        uploader: Object exposing ``upload_file(local_path, remote_path)``.
        cdn_domain: Prefix used to build the public URL of an uploaded patch.

    Returns:
        A list of station dicts, possibly empty.
    """
    # 1. Graphical slicing.
    segments = XinDianTuReadImageKit.get_card_segments(image_path)
    if not segments:
        return []
    # 2. Upload every patch and queue its OCR.
    # NOTE(review): upload_file is called synchronously inside this coroutine.
    # If it performs blocking network I/O it will stall the event loop; confirm.
    tasks = []
    for seg in segments:
        patch_path = seg["patch_path"]
        remote_path = f"tmp/patches/{os.path.basename(patch_path)}"
        uploader.upload_file(patch_path, remote_path)
        patch_url = f"{cdn_domain}/{remote_path}"
        # Remember the uploaded URL on the segment for the recognition step.
        seg["patch_url"] = patch_url
        tasks.append(XinDianTuReadImageKit.recognize_card_text(patch_url))
    # Wait for all OCR calls; gather() keeps the order of the segments.
    results = await asyncio.gather(*tasks)
    # 3. Assemble the results.
    final_stations = []
    for seg, res in zip(segments, results):
        name = res.get("station_name")
        if name and name != "未知":
            res["x"] = seg["center_x"]
            res["y"] = seg["center_y"]
            final_stations.append(res)
    # 4. Post-filter by the exclusion lists.
    # This file neither defines nor imports STATION_EXCLUDED_TITLES or
    # STATION_BLACKLIST_KEYWORDS, so referencing them directly raised
    # NameError as soon as one station was recognised. They are looked up
    # defensively: the filter applies if they are defined at module level
    # and is a no-op otherwise.
    if final_stations:
        excluded_titles = globals().get("STATION_EXCLUDED_TITLES") or ()
        blacklist_keywords = globals().get("STATION_BLACKLIST_KEYWORDS") or ()
        processed_excluded_titles = {str(x).replace(" ", "").strip() for x in excluded_titles}
        filtered = []
        for item in final_stations:
            normalized_name = str(item.get("station_name")).replace(" ", "").strip()
            if normalized_name in processed_excluded_titles:
                continue
            if any(kw in normalized_name for kw in blacklist_keywords):
                continue
            filtered.append(item)
        final_stations = filtered
    return final_stations
@staticmethod
def get_card_segments(image_path, output_dir="./Debug/Patches"):
    """Cut station cards out of a screenshot using horizontal divider bands.

    Every pixel row is classified as divider, white/empty or content. Runs of
    non-divider rows that contain at least one content row become candidate
    blocks. Candidates are filtered by height and brightness, cropped
    horizontally to their content, and saved as JPEG patches in *output_dir*.

    Args:
        image_path: Path of the screenshot to analyse.
        output_dir: Directory for the patch files; created if missing.

    Returns:
        A list of dicts ``{"patch_path": str, "center_x": int,
        "center_y": int, "y_range": (y1, y2)}``. ``[]`` when any step raises.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    try:
        # Close the source file deterministically; convert() returns a copy
        # that stays valid after the file is closed.
        with Image.open(image_path) as source_image:
            img = source_image.convert('RGB')
        width, height = img.size
        img_data = np.array(img)
        logger.info(f"页面分析 - 分辨率: {width}x{height}")
        # 1. Row analysis: find divider rows.
        # Row types: 0 = content, 1 = divider, 2 = white/empty.
        row_types = []
        debug_rows = []
        for y in range(height):
            # Ignore the top and bottom 15% (header/footer). These rows are
            # marked as divider so that they can never form a segment.
            if y < height * 0.15 or y > height * 0.85:
                row_types.append(1)
                continue
            row = img_data[y]
            row_mean = np.mean(row)
            row_std = np.std(row)
            # Classification:
            #   mean > 252                    -> white row (card background)
            #   230 < mean <= 252, std < 15   -> divider (uniform light grey)
            #   anything else                 -> content (text or images)
            if row_mean > 252:
                r_type = 2  # White/Empty
            elif 230 < row_mean <= 252 and row_std < 15:
                r_type = 1  # Divider
            else:
                r_type = 0  # Content
            row_types.append(r_type)
            # Sample every 50th row for the debug log.
            if y % 50 == 0:
                debug_rows.append(f"Row {y}: Mean={row_mean:.1f}, Std={row_std:.1f} -> Type={r_type}")
        if debug_rows:
            logger.info("行特征采样 (调试用):\n" + "\n".join(debug_rows))
        # 2. Collect runs of non-divider rows that contain content.
        in_segment = False
        seg_start = -1
        has_content = False
        raw_blocks = []
        for y, r_type in enumerate(row_types):
            if r_type != 1:  # Not a divider
                if not in_segment:
                    in_segment = True
                    seg_start = y
                    has_content = False
                if r_type == 0:
                    has_content = True
            else:  # Divider
                if in_segment:
                    # A run ends here; keep it only if it held content.
                    if has_content:
                        raw_blocks.append((seg_start, y))
                    in_segment = False
        # Close a run that reaches the last row.
        if in_segment and has_content:
            raw_blocks.append((seg_start, len(row_types)))
        # 3. Filter and post-process the candidates.
        base_name = os.path.splitext(os.path.basename(image_path))[0]
        valid_segments = []
        logger.info(f"Initial raw blocks count: {len(raw_blocks)}")
        for i, (y1, y2) in enumerate(raw_blocks):
            h = y2 - y1
            # Too small: probably noise or a lone text line (dropped silently).
            if h < 50:
                continue
            # Too large: probably a full-screen mis-detection.
            if h > 1000:
                continue
            # Re-check that the region is not almost pure white.
            region = img_data[y1:y2]
            region_mean = np.mean(region)
            if region_mean > 254.5:
                logger.info(f" [-] 忽略区域 {i}: Y({y1}-{y2}), H={h}, 整体太白 (Mean={region_mean:.1f} > 254.5)")
                continue
            # Second height gate; unlike the first one, this one is logged.
            if h < 60:
                logger.info(f" [-] 忽略区域 {i}: Y({y1}-{y2}), H={h}, 高度不足 (<60)")
                continue
            # Mean brightness of the left/right margins; logged only, never
            # used as a filter.
            margin_w = max(5, int(width * 0.05))
            l_margin = img_data[y1:y2, 0:margin_w]
            r_margin = img_data[y1:y2, width-margin_w:width]
            l_mean = np.mean(l_margin)
            r_mean = np.mean(r_margin)
            # Automatic horizontal crop: find the first column from each side
            # that is not plain background (std > 5 or mean < 245).
            x1, x2 = 0, width
            # Scan from the left, over at most 40% of the width.
            for x in range(0, int(width * 0.4), 2):
                col = img_data[y1:y2, x]
                if np.std(col) > 5 or np.mean(col) < 245:
                    x1 = x
                    break
            # Scan from the right, over at most 40% of the width.
            for x in range(width - 1, int(width * 0.6), -2):
                col = img_data[y1:y2, x]
                if np.std(col) > 5 or np.mean(col) < 245:
                    x2 = x + 1
                    break
            # Add a little padding, clamped to the image.
            x1 = max(0, x1 - 10)
            x2 = min(width, x2 + 10)
            # A crop narrower than half the screen is not treated as a card.
            if (x2 - x1) < width * 0.5:
                logger.info(f" [-] 忽略区域 {i}: 裁剪后宽度过小 ({x2-x1})")
                continue
            # Save the patch.
            patch = img.crop((x1, y1, x2, y2))
            patch_name = f"{base_name}_p{i}_{y1}.jpg"
            patch_path = os.path.join(output_dir, patch_name)
            patch.save(patch_path)
            center_y = (y1 + y2) // 2
            valid_segments.append({
                "patch_path": os.path.abspath(patch_path),
                "center_x": (x1 + x2) // 2,
                "center_y": center_y,
                "y_range": (y1, y2)
            })
            logger.info(f" [+] 发现卡片 {i}: Y({y1}-{y2}), H={h}, Crop X({x1}-{x2}), 边缘(L={l_mean:.1f}, R={r_mean:.1f}), 已保存")
        logger.info(f"分析完成:识别到 {len(valid_segments)} 个区域")
        return valid_segments
    except Exception as e:
        logger.error(f"图形学切片失败: {e}", exc_info=True)
        return []
@staticmethod
async def recognize_card_text(patch_url):
    """Run the vision model on one card patch and extract the station details.

    Args:
        patch_url: URL of the uploaded card patch.

    Returns:
        The JSON object found in the model's answer (the prompt asks for
        ``station_name``, ``price`` and ``piles``). ``{"station_name": "未知"}``
        when the answer is empty, holds no object, or cannot be parsed.
    """
    # `re` is not imported at module level in this file, so import it here.
    import re

    # The client call is synchronous, so it runs in the default executor.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None,
        lambda: XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "请识别图片中的充电站信息,并以 JSON 格式输出:{\"station_name\": \"...\", \"price\": \"...\", \"piles\": \"空闲数/总数\"}。只输出 JSON不要有其他文字。"
                        },
                        {"type": "image_url", "image_url": {"url": patch_url}}
                    ]
                }
            ],
            max_tokens=200
        )
    )
    # The message content may be None; the other model calls in this class
    # guard against that in the same way.
    content = (resp.choices[0].message.content or "").strip()
    # Take everything from the first "{" to the last "}" and parse it.
    json_match = re.search(r'\{.*\}', content, re.DOTALL)
    if json_match:
        try:
            parsed = json.loads(json_match.group())
        except ValueError:
            logger.warning(f"OCR 结果解析 JSON 失败: {content}")
        else:
            # Callers use .get() on the result, so only a dict is returned.
            if isinstance(parsed, dict):
                return parsed
    return {"station_name": "未知"}
@staticmethod
async def detect_ad(image_url: str, device_info: dict = None) -> dict:
    """Detect a popup ad on the screenshot and locate its close button.

    Args:
        image_url: URL of the screenshot.
        device_info: Display metadata; ``_FALLBACK_DEVICE_INFO`` is used when
            omitted.

    Returns:
        ``{"has_ad": bool, "uia_center_x": int | None, "uia_center_y": int | None}``.
        ``has_ad`` is ``True`` only when the model reported both an ad and
        close-button bounds. The coordinates stay ``None`` if no tap point
        could be derived from those bounds.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    prompt = (
        "分析图片中是否存在覆盖在主界面上的广告弹窗Popup Ad"
        "如果存在,请找到关闭该弹窗的按钮(通常是一个带有 'X' 的图标,可能在弹窗的右上角、右下角或正下方)。"
        "仅输出JSON对象不含任何说明文字包含以下字段"
        "1. has_ad: 布尔值,是否存在广告弹窗;"
        "2. close_button_bounds: 关闭按钮的像素坐标或归一化坐标(0-1000) {x1,y1,x2,y2}。如果不存在广告则为 null。"
        "注意只需识别最明显的那个关闭按钮。严格返回纯JSON。"
    )

    def _ask_model():
        # Synchronous client call; executed in the default executor below.
        # VL_MODEL_NAME_AD is the model configured for ad detection
        # (original note: a stronger vision model).
        user_parts = [
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
            {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
            {"type": "text", "text": prompt},
        ]
        return XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME_AD,
            messages=[{"role": "user", "content": user_parts}],
        )

    loop = asyncio.get_event_loop()
    resp = await loop.run_in_executor(None, _ask_model)
    answer = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(answer)
    try:
        payload = json.loads(raw)
        ad_present = payload.get("has_ad", False)
        close_bounds = payload.get("close_button_bounds") if ad_present else None
        if not close_bounds:
            return {"has_ad": False, "uia_center_x": None, "uia_center_y": None}
        # Reuse the shared helper to turn the bounds into a tap point; it
        # writes the coordinates onto the holder dict.
        holder = {"bounds": close_bounds}
        XinDianTuReadImageKit._add_click_point(holder, device_info)
        return {
            "has_ad": True,
            "uia_center_x": holder.get("uia_center_x"),
            "uia_center_y": holder.get("uia_center_y")
        }
    except Exception as e:
        logger.error(f"Error parsing ad detection JSON: {e}")
        return {"has_ad": False, "uia_center_x": None, "uia_center_y": None}
@staticmethod
async def _download_as_base64(url: str) -> str:
    """Download *url* and return its body as a Base64 string.

    Used as the fallback that embeds an image inline for the vision model.
    Returns ``None`` when the status is not 200, the body is empty, or the
    download raises (the error is logged).
    """
    try:
        async with aiohttp.ClientSession() as http, http.get(url) as reply:
            payload = await reply.read() if reply.status == 200 else None
            if payload:
                return base64.b64encode(payload).decode('utf-8')
            return None
    except Exception as e:
        logger.error(f"Failed to download image for base64 conversion: {e}")
        return None
@staticmethod
async def parse_price_schedule(station_name: str, image_url: str, device_info: dict = None) -> list:
    """Parse a price-schedule screenshot into one row per time period.

    Args:
        station_name: Station name copied into every returned row.
        image_url: URL of the image showing the per-period price table.
        device_info: Display metadata sent along with the image;
            ``_FALLBACK_DEVICE_INFO`` is used when omitted.

    Returns:
        A list of dicts::

            {
                "station_name": str,
                "start": "HH:MM",
                "end": "HH:MM",
                "price_kwh": float | None,         # total / station price per kWh
                "electric_fee_kwh": float | None,  # electricity fee per kWh
                "service_fee_kwh": float | None,   # service fee per kWh
            }

        A fee missing in a row is filled from the previous row when that row
        has it. ``[]`` is returned when the request fails or the answer is
        not a JSON array.

    Notes:
        - The model named by ``VL_MODEL_NAME`` is used.
        - If the API rejects the image URL (data-inspection or media-format
          error), the image is downloaded and resent inline as a Base64
          data URI.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    # The prompt pins the output fields and format so that the model does not
    # add explanatory text.
    prompt = (
        "仅输出JSON数组不含任何说明文字。识别图片中所有时段的价格信息返回每一行"
        "1) start: 开始时间HH:MM2) end: 结束时间HH:MM"
        "3) price_kwh: 价格(元/度,站点价或总价),"
        "4) electric_fee_kwh: 电费(元/度),"
        "5) service_fee_kwh: 服务费(元/度)。"
        "所有数值以数字返回,例如 1.1800。若缺失某项则填 null。严格返回纯JSON数组。"
        "注意:如果某行价格信息为空或表示同上,请尝试复用上一行的价格信息。"
    )
    loop = asyncio.get_running_loop()

    def _do_request(url_val):
        # Synchronous request; url_val is an http(s) URL or a data URI.
        return XinDianTuReadImageKit._client.chat.completions.create(
            model=VL_MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": url_val},
                        },
                        {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
                        {"type": "text", "text": prompt},
                    ],
                },
            ],
        )

    def _to_float(value):
        # Convert a number or numeric string to float. Anything else,
        # including text such as "免费", becomes None, so one odd cell can
        # no longer discard the whole schedule.
        if not isinstance(value, (int, float, str)):
            return None
        try:
            return float(value)
        except ValueError:
            return None

    try:
        # Run the synchronous client in the executor to keep the event loop free.
        resp = await loop.run_in_executor(None, lambda: _do_request(image_url))
    except BadRequestError as e:
        # Look for a data-inspection / media-format rejection and fall back
        # to sending the image inline.
        err_code = getattr(e, 'code', '') or ''
        if not err_code and hasattr(e, 'body') and isinstance(e.body, dict):
            err_code = e.body.get('code', '')
        if 'InvalidParameter.DataInspection' in str(err_code) or 'media format' in str(e).lower():
            logger.warning(f"Image URL rejected ({err_code}). Attempting Base64 fallback: {image_url}")
            base64_str = await XinDianTuReadImageKit._download_as_base64(image_url)
            if not base64_str:
                logger.error("Base64 download failed during fallback.")
                return []
            # Guess the image format from the URL; default to jpeg.
            lower_url = image_url.lower()
            if ".png" in lower_url:
                ext = "png"
            elif ".webp" in lower_url:
                ext = "webp"
            else:
                ext = "jpeg"
            data_uri = f"data:image/{ext};base64,{base64_str}"
            try:
                resp = await loop.run_in_executor(None, lambda: _do_request(data_uri))
            except Exception as retry_err:
                # The retry previously escaped uncaught; keep the "return []
                # on failure" contract of this method.
                logger.error(f"API Base64 fallback failed: {retry_err}")
                return []
        else:
            logger.error(f"API BadRequestError: {e}")
            return []
    except Exception as e:
        logger.error(f"API Unexpected Error: {e}")
        return []
    content = resp.choices[0].message.content or ""
    raw = XinDianTuReadImageKit._extract_json(content)
    try:
        rows = json.loads(raw)
        if not isinstance(rows, list):
            return []
        norm = []
        for r in rows:
            if not isinstance(r, dict):
                continue
            norm.append({
                "station_name": station_name,
                "start": r.get("start"),
                "end": r.get("end"),
                "price_kwh": _to_float(r.get("price_kwh")),
                "electric_fee_kwh": _to_float(r.get("electric_fee_kwh")),
                "service_fee_kwh": _to_float(r.get("service_fee_kwh")),
            })
        # Forward-fill: a fee missing in a row is inherited from the row above.
        for i in range(1, len(norm)):
            curr = norm[i]
            prev = norm[i - 1]
            for k in ("price_kwh", "electric_fee_kwh", "service_fee_kwh"):
                if curr.get(k) is None and prev.get(k) is not None:
                    curr[k] = prev[k]
        return norm
    except Exception as e:
        logger.error(f"Error parsing JSON: {e}")
        logger.error(f"Raw content: {raw}")
        return []
@staticmethod
def _to_minutes(t: str) -> int:
"""
"HH:MM" 转为分钟数0-1440
说明:
- 特殊处理 "24:00" -> 1440方便区间闭合处理
- 非法格式返回 0
"""
if not t:
return 0
try:
h, m = t.split(":")
h = int(h)
m = int(m)
if h == 24 and m == 0:
return 24 * 60
return h * 60 + m
except Exception:
return 0
@staticmethod
def _fmt(t: int) -> str:
"""
将分钟数格式化为 "HH:MM"
"""
h = t // 60
m = t % 60
return f"{h:02d}:{m:02d}"
@staticmethod
def expand_schedule_to_hourly(rows: list) -> list:
"""
将时段列表按小时边界拆分
参数:
rows: parse_price_schedule 返回的时段列表
返回:
每小时一条数据的列表,区间为闭开 [start, end) 的连续小时段
说明:
- 例如 05:00-08:00 -> 05:00-06:00, 06:00-07:00, 07:00-08:00
- 保留每小时的价格、电费、服务费不变
"""
hourly = []
for r in rows:
# 起止时间转分钟
s = XinDianTuReadImageKit._to_minutes(r.get("start"))
e = XinDianTuReadImageKit._to_minutes(r.get("end"))
if e <= s:
continue
cur = s
while cur < e:
# 下一小时边界;不超过区间终点
nxt = min(e, ((cur // 60) + 1) * 60)
hourly.append({
"start": XinDianTuReadImageKit._fmt(cur),
"end": XinDianTuReadImageKit._fmt(nxt),
"price_kwh": r.get("price_kwh"),
"electric_fee_kwh": r.get("electric_fee_kwh"),
"service_fee_kwh": r.get("service_fee_kwh"),
})
cur = nxt
return hourly
@staticmethod
async def _fetch_md5(url: str) -> str:
    """Download *url* and return the MD5 hex digest of its body.

    The digest serves as a content key for de-duplicating images. An empty
    string is returned when the status is not 200 or the download raises.
    """
    try:
        async with aiohttp.ClientSession() as http, http.get(url) as reply:
            if reply.status == 200:
                return hashlib.md5(await reply.read()).hexdigest()
            return ""
    except Exception:
        return ""
@staticmethod
async def parse_address(station_name: str, image_url: str, device_info: dict = None) -> dict:
    """Read the full station name and address from a detail screenshot.

    The lookup of the "全部时段" button runs concurrently on the same image,
    and its coordinates are merged into the result.

    Args:
        station_name: Possibly truncated name seen in the list; inserted into
            the prompt as a hint.
        image_url: URL of the detail screenshot.
        device_info: Display metadata; ``_FALLBACK_DEVICE_INFO`` is used when
            omitted.

    Returns:
        The JSON object answered by the model (the prompt asks for
        ``full_station_name`` and ``address``), updated with ``uia_center_x``
        and ``uia_center_y`` when the button lookup returned something.
        ``{}`` if neither part produced data.
    """
    if device_info is None:
        logger.warning("未提供动态设备信息,使用通用回退配置。")
        device_info = XinDianTuReadImageKit._FALLBACK_DEVICE_INFO
    loop = asyncio.get_running_loop()
    # Start the button lookup now so that it overlaps with the address request.
    button_task = asyncio.create_task(XinDianTuReadImageKit.find_all_time_button_coordinate(image_url, device_info))
    prompt = (
        "仅输出JSON对象不含任何说明文字"
        "任务1识别图片中充电站的完整名称full_station_name"
        f"提示:列表中看到的名称可能是截断的(例如“{station_name}”),请在图片上方找到最匹配的完整名称。"
        "任务2识别充电站的详细地址address"
        "寻找规则:地址通常紧跟在场站名称下方,或者在‘距离’图标(定位小图表)附近,或者在带有‘导航’按钮的同一行。"
        "返回包含 full_station_name 和 address 字段的JSON对象例如 {\"full_station_name\": \"完整名称\", \"address\": \"详细地址\"}。"
        "如果找不到,对应字段返回空字符串。"
        "严格返回纯JSON格式。"
    )
    try:
        resp = await loop.run_in_executor(
            None,
            lambda: XinDianTuReadImageKit._client.chat.completions.create(
                model=VL_MODEL_NAME,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": json.dumps(device_info, ensure_ascii=False)},
                            {"type": "text", "text": prompt},
                        ],
                    },
                ],
            )
        )
        content = resp.choices[0].message.content or ""
    except BaseException:
        # The error still propagates to the caller, but the background task
        # must not be left running un-awaited.
        button_task.cancel()
        raise
    raw = XinDianTuReadImageKit._extract_json(content)
    result = {}
    try:
        data = json.loads(raw)
        if isinstance(data, dict):
            result = data
    except Exception as e:
        logger.error(f"Error parsing address JSON: {e}")
    # Wait for the button lookup and merge its coordinates.
    try:
        button_result = await button_task
        if button_result:
            result.update(button_result)
    except Exception as e:
        logger.error(f"Error in button coordinate task: {e}")
    return result
@staticmethod
async def parse_price_schedule_multi(station_name: str, image_urls: list, device_info: dict = None) -> list:
    """Parse several price-table images and merge their period rows.

    Args:
        station_name: Station name forwarded to ``parse_price_schedule``.
        image_urls: URLs of the price-table images.
        device_info: Optional display metadata, forwarded unchanged.

    Returns:
        All period rows from the unique images, concatenated in input order
        (not split by hour). ``[]`` for an empty URL list.

    Notes:
        Images are de-duplicated by the MD5 of their content, and the first
        URL of each distinct content wins. A URL whose digest could not be
        fetched is keyed by the URL itself. It used to be dropped, which
        silently lost that image's rows on a transient download failure.
    """
    if not image_urls:
        return []
    # _fetch_md5 never raises (it returns "" on failure), so the downloads
    # can safely run concurrently; gather() keeps the input order.
    digests = await asyncio.gather(
        *(XinDianTuReadImageKit._fetch_md5(u) for u in image_urls)
    )
    seen_keys = set()
    unique_urls = []
    for url, digest in zip(image_urls, digests):
        key = digest or url
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_urls.append(url)
    # Parse the images one after another and merge their rows.
    combined = []
    for u in unique_urls:
        rows = await XinDianTuReadImageKit.parse_price_schedule(station_name, u, device_info=device_info)
        if rows:
            combined.extend(rows)
    return combined
@staticmethod
def hourly_full_day(rows: list) -> list:
"""
将时段列表规整为全天24个整点小时段
参数:
rows: 原始时段列表(可来自多图合并)
返回:
固定24条记录00:00-01:00 到 23:00-24:00
若某小时未被任何时段覆盖,则费用为 None
说明:
- 选择覆盖该小时段的时段(若多个,则选择重叠时间最长的一个)
- 保证返回结构完整,便于后续消费端显示或补全
"""
# 预处理:转换为分钟区间
intervals = []
for r in rows:
s = XinDianTuReadImageKit._to_minutes(r.get("start"))
e = XinDianTuReadImageKit._to_minutes(r.get("end"))
if e <= s:
continue
s = max(0, s)
e = min(1440, e)
intervals.append({
"s": s, "e": e,
"price_kwh": r.get("price_kwh"),
"electric_fee_kwh": r.get("electric_fee_kwh"),
"service_fee_kwh": r.get("service_fee_kwh"),
})
intervals.sort(key=lambda x: (x["s"], x["e"]))
result = []
for h in range(24):
hs = h * 60
he = (h + 1) * 60
best = None
best_overlap = 0
for it in intervals:
overlap = max(0, min(he, it["e"]) - max(hs, it["s"]))
if overlap > best_overlap:
best_overlap = overlap
best = it
result.append({
"start": XinDianTuReadImageKit._fmt(hs),
"end": XinDianTuReadImageKit._fmt(he),
"price_kwh": best["price_kwh"] if best else None,
"electric_fee_kwh": best["electric_fee_kwh"] if best else None,
"service_fee_kwh": best["service_fee_kwh"] if best else None,
})
return result
async def test1():
    """Manual check: parse the first-level (station list) screenshot and print it."""
    sample_url = "https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/Temp/8fd79c68-fec6-4ca7-8d8e-fbff3c6862c8.jpg"
    print(f"Testing First Level with URL: {sample_url}")
    stations = await XinDianTuReadImageKit.parse_first_level_image_url(sample_url)
    print("First Level Result:")
    print(json.dumps(stations, ensure_ascii=False, indent=2))
async def test2(station_name: str):
    """Manual check: extract the address from the second-level (detail) screenshot."""
    detail_url = "https://dsideal.obs.myhuaweicloud.com/HuangHai/Temp/SecondPage.jpg"
    print(f"Testing address extraction from: {detail_url}")
    address_info = await XinDianTuReadImageKit.parse_address(station_name, detail_url)
    # Echo the name that was searched for next to the model's answer.
    address_info["station_name"] = station_name
    print("Address result:")
    print(json.dumps(address_info, ensure_ascii=False, indent=2))
async def test3(station_name: str):
    """Manual check: parse two price-table images and print the 24-hour schedule."""
    base = "https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/Temp/XinDianTu/"
    sample_urls = [base + "1.jpg", base + "2.jpg"]
    period_rows = await XinDianTuReadImageKit.parse_price_schedule_multi(station_name, sample_urls)
    per_hour = XinDianTuReadImageKit.hourly_full_day(period_rows)
    print(json.dumps(per_hour, ensure_ascii=False, indent=2))
if __name__ == "__main__":
    # Manual smoke tests against sample images. Only one asyncio.run(...)
    # call is active at a time; the others are kept commented out as toggles.
    #asyncio.run(test1())
    station_name = '吉林省看守所充电站'
    #asyncio.run(test2(station_name))
    asyncio.run(test3(station_name))