This commit is contained in:
HuangHai
2026-01-13 10:57:30 +08:00
parent 7c1864b995
commit 3978123b88
10 changed files with 73 additions and 29 deletions

View File

@@ -1,8 +1,8 @@
# 采集配置
# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大
# 参考场站卡片高度(约屏幕 18-20%),设置为 0.2 以便每次精确翻页一个场站
SCROLL_DISTANCE_RATIO = 0.22
# 使用“小步快跑”策略,设置为 0.15 以避开广告并防止跳过场站
SCROLL_DISTANCE_RATIO = 0.15
# 最大滑动/翻页次数,达到此次数后停止采集
MAX_SCROLLS = 100
# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集

View File

@@ -164,9 +164,11 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
image_uuid = str(uuid.uuid4())
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
# 检查是否存在广告 (VL) - 仅处理非兔子广告
# 检查是否存在广告 (VL)
logger.info("检查是否存在广告弹窗...")
ad_res = await ReadImageKit.detect_ad_popup(screenshot_path, device_info=device_info)
ad_top_y_norm = 0.90 # 默认使用一个更宽松的底部边界 (0.90),防止点到导航栏
if ad_res:
ad_type = ad_res.get("ad_type")
if ad_type != "rabbit":
@@ -178,7 +180,10 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
if os.path.exists(screenshot_path): os.remove(screenshot_path)
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
else:
logger.info(">>> 检测到兔子广告。根据策略不再处理,直接开始识别场站。")
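# Rabbit ad found: do not try to close it; instead use a fixed safety boundary in normalized (0-1) screen coordinates.
# 0.90 is the safety threshold: a station card whose bottom edge has a normalized Y greater than this value (i.e. lower on the screen) is treated as occluded.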
ad_top_y_norm = 0.90
logger.info(f">>> 发现兔子广告,设定安全边界为 Y_norm < {ad_top_y_norm}")
# 检查是否已经滚动到底部 (排除状态栏后,内容与上次一致)
current_md5 = Kit.get_image_content_md5(
@@ -234,12 +239,19 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
logger.info(f"场站 {station_name} 匹配到已处理记录,跳过。")
continue
# 【策略调整】每页仅处理第一个识别到的新场站,确保其不被底部广告遮挡
logger.info(f">>> 发现新场站 '{station_name}'。根据最新策略,每页仅处理第一个场站以避开广告。")
# 【优化】检查是否被遮挡或太靠近底部
# card["bounds_norm"] 是 {left, top, right, bottom}
card_bottom = card["bounds_norm"]["bottom"]
if card_bottom > ad_top_y_norm:
logger.warning(f"场站 '{station_name}' 被遮挡或太靠近底部 (Bottom {card_bottom:.2f} > {ad_top_y_norm}),留待下次滚动处理。")
continue
# 正常处理新场站
logger.info(f">>> 发现新场站 '{station_name}',开始处理...")
new_stations_processed += 1
click_x, click_y = card["click_point"]
logger.info(f"准备处理{idx + 1}场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
logger.info(f"准备处理场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
d.click(int(click_x), int(click_y))
@@ -248,11 +260,26 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
should_back_to_list = True
# 截取二级页面图
detail_uuid = str(uuid.uuid4())
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
# 【新增】二级页面广告检测 (如:免费停车提示)
logger.info("检查详情页是否存在干扰弹窗...")
detail_ad_res = await ReadImageKit.detect_ad_popup(detail_path, device_info=device_info)
if detail_ad_res:
dad_type = detail_ad_res.get("ad_type")
if dad_type == "rabbit":
logger.info(">>> 详情页检测到 rabbit 广告,判定为误报,忽略。")
else:
dx, dy = detail_ad_res["x"], detail_ad_res["y"]
logger.info(f"检测到详情页弹窗: {dad_type},正在点击关闭 ({dx}, {dy})...")
d.click(dx, dy)
await asyncio.sleep(1.5)
# 重新截图,确保后续流程使用的是清理后的界面
if os.path.exists(detail_path): os.remove(detail_path)
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
# 【优化】后台解析详情页地址,直接传本地路径,避免等待上传
logger.info(f"已启动后台分析详情页: {station_name}")
task_addr = asyncio.create_task(service.process_station_address(station_name, detail_path, device_info=device_info))
@@ -351,17 +378,13 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
# 从二级页面返回 (仅当确实需要返回时)
if should_back_to_list:
d.press("back")
logger.info(f"等待 {WAIT_BACK_TO_LIST} 秒返回列表...")
await asyncio.sleep(WAIT_BACK_TO_LIST)
logger.info(f"等待 {WAIT_BACK_TO_LIST + 1} 秒返回列表...")
await asyncio.sleep(WAIT_BACK_TO_LIST + 1)
# 记录 Redis 去重 (仅按名称去重)
cleaned = Kit.clean_station_name(station_name)
await redis_kit.set_data(f"crawled:xdt:{cleaned}", "1", expire=REDIS_STATION_EXPIRE)
# 【策略执行】处理完第一个新场站后立即退出循环,触发翻页
logger.info(f"已完成本页首个新场站 '{station_name}' 的处理,准备翻页。")
break
# 清理已完成的后台任务
done_tasks = [t for t in background_tasks if t.done()]
for t in done_tasks:
@@ -379,7 +402,9 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
no_new_data_count = 0
# 6. 翻页
logger.info("执行翻页滑动...")
# 【优化】使用“小步快跑”策略,减小滑动距离以避免跳过场站,并能更平滑地躲避广告
# 步长已在 Setting.py 中统一配置为 SCROLL_DISTANCE_RATIO
logger.info(f"执行翻页滑动 (步长: {SCROLL_DISTANCE_RATIO})...")
d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
await asyncio.sleep(WAIT_AFTER_SCROLL)

View File

@@ -204,8 +204,8 @@ def detect_rabbit_ad_close(image_path, debug_dir=None):
cX = int(M["m10"] / M["m00"]) + roi_x1
cY = int(M["m01"] / M["m00"]) + roi_y1
norm_x = int(cX / w * 1000)
norm_y = int(cY / h * 1000)
norm_x = cX / w
norm_y = cY / h
# 避免重复
if not any(abs(cX - c[0]) < 15 and abs(cY - c[1]) < 15 for c in candidates):
@@ -220,17 +220,23 @@ def detect_rabbit_ad_close(image_path, debug_dir=None):
has_x = c[5]
# Base score: a candidate that contains an "X" mark gets a large bonus (1000); otherwise it starts at 0.
score = 1000 if has_x else 0
# 距离分:越靠近预期的 (93, 830) 分越高
dist = np.sqrt((c[3] - 93)**2 + (c[4] - 830)**2)
score -= dist * 2
# 距离分:越靠近预期的 (0.094, 0.830) 分越高
dist = np.sqrt((c[3] - 0.094)**2 + (c[4] - 0.830)**2)
score -= dist * 2000 # 归一化后距离变小,需加大权重
# 面积分:理想面积在 500-1500 之间
if 500 < c[2] < 1500: score += 200
return score
candidates.sort(key=score_candidate, reverse=True)
best = candidates[0]
best_score = score_candidate(best)
logger.info(f"CV detected rabbit ad close button at Norm({best[3]}, {best[4]}) with score {score_candidate(best):.2f}")
logger.info(f"CV detected rabbit ad close button at Norm({best[3]:.3f}, {best[4]:.3f}) with score {best_score:.2f}")
# 【优化】如果得分太低 (低于 850),说明误判概率较大,不予返回
if best_score < 850:
logger.info(f"Score {best_score:.2f} is below threshold 850, ignoring candidate.")
return None
if debug_dir:
os.makedirs(debug_dir, exist_ok=True)
@@ -749,13 +755,19 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
# 使用 Median 比 Max 更稳健,避免被单个超高卡片(如广告)带偏
threshold_h = median_h * 0.70
for (y1, y2), h in zip(temp_valid_segments, heights):
if h < threshold_h:
for (y1, y2), card_h in zip(temp_valid_segments, heights):
# 【优化】过滤顶部非场站区域 (例如会员图标、搜索栏等)
# 场站列表通常在屏幕 35% 高度以后
if y1 / h < 0.35:
logger.info(f" Filtering out segment Y={y1}-{y2} because it's too high up (Top {y1/h:.2f} < 0.35).")
continue
if card_h < threshold_h:
logger.info(
f" Filtering out segment Y={y1}-{y2} (H={h}) because it's too short (Threshold={threshold_h:.1f}).")
f" Filtering out segment Y={y1}-{y2} (H={card_h}) because it's too short (Threshold={threshold_h:.1f}).")
else:
final_cards.append((y1, y2, final_x1, final_x2))
logger.info(f" Card: Y={y1}-{y2}, X={final_x1}-{final_x2}, H={h}")
logger.info(f" Card: Y={y1}-{y2}, X={final_x1}-{final_x2}, H={card_h}")
# 4. 保存结果
if output_dir is None:
@@ -812,6 +824,12 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
card_info = {
"id": idx + 1,
"rect": [x1, y1, x2, y2],
"bounds_norm": {
"left": x1 / w,
"top": y1 / h,
"right": x2 / w,
"bottom": y2 / h
},
"click_point": [click_x, click_y]
}
json_data["cards"].append(card_info)

View File

@@ -55,7 +55,7 @@ async def check_and_close_ad(d):
logger.info(f"检测到广告关闭按钮: ({x}, {y}) [Type: {ad_type}]")
if ad_type == "rabbit":
logger.info(">>> 检测到兔子广告。根据最新策略,我们不再尝试关闭它,直接继续。")
logger.info(">>> 检测到兔子广告。根据策略,我们不再尝试关闭它,直接继续。")
if os.path.exists(screenshot_path): os.remove(screenshot_path)
return True
else:

View File

@@ -89,6 +89,7 @@ class ReadImageKit:
cv_rabbit_point = detect_rabbit_ad_close(image_path, debug_dir=TEMP_IMAGE_DIR)
if cv_rabbit_point:
logger.info(f">>> 图形学算法精准捕捉到兔子广告关闭按钮: {cv_rabbit_point}")
# NOTE: detect_rabbit_ad_close in Kit.py now returns coordinates normalized to the 0-1.0 range (no longer 0-1000).
norm_point = cv_rabbit_point
ad_type = "rabbit"
@@ -97,8 +98,8 @@ class ReadImageKit:
device_info = cls._FALLBACK_DEVICE_INFO
w = device_info.get("displayWidth", FALLBACK_WIDTH)
h = device_info.get("displayHeight", FALLBACK_HEIGHT)
x = int(norm_point[0] / 1000 * w)
y = int(norm_point[1] / 1000 * h)
x = int(norm_point[0] * w)
y = int(norm_point[1] * h)
return {"x": x, "y": y, "ad_type": ad_type}
# 2. 如果图形学没找到,再请求大模型进行全量检测
@@ -110,7 +111,7 @@ class ReadImageKit:
prompt = (
"请仔细检查这张图片中是否存在**弹窗广告**或**悬浮广告**。\n"
"广告可能有以下几种形式:\n"
"1. **屏幕中央的大型弹窗广告**:通常遮挡了页面内容,内容多为优惠券、活动推广\n"
"1. **屏幕中央的大型弹窗广告**:通常遮挡了页面内容,内容多为优惠券、活动推广、或“免费停车”类提示例如现在可免费停车N小时了\n"
"2. **'新电兔AI'专属悬浮广告**(高频):一个卡通兔子头,带有'新电兔AI'字样关闭按钮是一个黑色圆圈内含白色X。\n"
"3. **底部的横幅广告**:带有明显的关闭按钮。\n\n"
"请返回关闭按钮的中心坐标。\n\n"