'commit'
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
|
||||
# 采集配置
|
||||
# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大
|
||||
# 参考场站卡片高度(约屏幕 18-20%),设置为 0.2 以便每次精确翻页一个场站
|
||||
SCROLL_DISTANCE_RATIO = 0.22
|
||||
# 使用“小步快跑”策略,设置为 0.15 以避开广告并防止跳过场站
|
||||
SCROLL_DISTANCE_RATIO = 0.15
|
||||
# 最大滑动/翻页次数,达到此次数后停止采集
|
||||
MAX_SCROLLS = 100
|
||||
# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集
|
||||
|
||||
Binary file not shown.
@@ -164,9 +164,11 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
image_uuid = str(uuid.uuid4())
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
|
||||
# 检查是否存在广告 (VL) - 仅处理非兔子广告
|
||||
# 检查是否存在广告 (VL)
|
||||
logger.info("检查是否存在广告弹窗...")
|
||||
ad_res = await ReadImageKit.detect_ad_popup(screenshot_path, device_info=device_info)
|
||||
|
||||
ad_top_y_norm = 0.90 # 默认使用一个更宽松的底部边界 (0.90),防止点到导航栏
|
||||
if ad_res:
|
||||
ad_type = ad_res.get("ad_type")
|
||||
if ad_type != "rabbit":
|
||||
@@ -178,7 +180,10 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
else:
|
||||
logger.info(">>> 检测到兔子广告。根据策略不再处理,直接开始识别场站。")
|
||||
# Rabbit ad found: do not try to close it; use a fixed normalized (0-1) top boundary for it rather than measuring one.
|
||||
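# Use 0.90 as the safety threshold: a station card whose bottom edge has a normalized Y greater than this (i.e. sits lower on screen) is treated as occluded.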
|
||||
ad_top_y_norm = 0.90
|
||||
logger.info(f">>> 发现兔子广告,设定安全边界为 Y_norm < {ad_top_y_norm}")
|
||||
|
||||
# 检查是否已经滚动到底部 (排除状态栏后,内容与上次一致)
|
||||
current_md5 = Kit.get_image_content_md5(
|
||||
@@ -234,12 +239,19 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
logger.info(f"场站 {station_name} 匹配到已处理记录,跳过。")
|
||||
continue
|
||||
|
||||
# 【策略调整】每页仅处理第一个识别到的新场站,确保其不被底部广告遮挡
|
||||
logger.info(f">>> 发现新场站 '{station_name}'。根据最新策略,每页仅处理第一个场站以避开广告。")
|
||||
# 【优化】检查是否被遮挡或太靠近底部
|
||||
# card["bounds_norm"] 是 {left, top, right, bottom}
|
||||
card_bottom = card["bounds_norm"]["bottom"]
|
||||
if card_bottom > ad_top_y_norm:
|
||||
logger.warning(f"场站 '{station_name}' 被遮挡或太靠近底部 (Bottom {card_bottom:.2f} > {ad_top_y_norm}),留待下次滚动处理。")
|
||||
continue
|
||||
|
||||
# 正常处理新场站
|
||||
logger.info(f">>> 发现新场站 '{station_name}',开始处理...")
|
||||
new_stations_processed += 1
|
||||
|
||||
click_x, click_y = card["click_point"]
|
||||
logger.info(f"准备处理第 {idx + 1} 个场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
|
||||
logger.info(f"准备处理场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
|
||||
|
||||
d.click(int(click_x), int(click_y))
|
||||
|
||||
@@ -248,11 +260,26 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
|
||||
should_back_to_list = True
|
||||
|
||||
|
||||
# 截取二级页面图
|
||||
detail_uuid = str(uuid.uuid4())
|
||||
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
|
||||
# 【新增】二级页面广告检测 (如:免费停车提示)
|
||||
logger.info("检查详情页是否存在干扰弹窗...")
|
||||
detail_ad_res = await ReadImageKit.detect_ad_popup(detail_path, device_info=device_info)
|
||||
if detail_ad_res:
|
||||
dad_type = detail_ad_res.get("ad_type")
|
||||
if dad_type == "rabbit":
|
||||
logger.info(">>> 详情页检测到 rabbit 广告,判定为误报,忽略。")
|
||||
else:
|
||||
dx, dy = detail_ad_res["x"], detail_ad_res["y"]
|
||||
logger.info(f"检测到详情页弹窗: {dad_type},正在点击关闭 ({dx}, {dy})...")
|
||||
d.click(dx, dy)
|
||||
await asyncio.sleep(1.5)
|
||||
# 重新截图,确保后续流程使用的是清理后的界面
|
||||
if os.path.exists(detail_path): os.remove(detail_path)
|
||||
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
|
||||
# 【优化】后台解析详情页地址,直接传本地路径,避免等待上传
|
||||
logger.info(f"已启动后台分析详情页: {station_name}")
|
||||
task_addr = asyncio.create_task(service.process_station_address(station_name, detail_path, device_info=device_info))
|
||||
@@ -351,17 +378,13 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
# 从二级页面返回 (仅当确实需要返回时)
|
||||
if should_back_to_list:
|
||||
d.press("back")
|
||||
logger.info(f"等待 {WAIT_BACK_TO_LIST} 秒返回列表...")
|
||||
await asyncio.sleep(WAIT_BACK_TO_LIST)
|
||||
logger.info(f"等待 {WAIT_BACK_TO_LIST + 1} 秒返回列表...")
|
||||
await asyncio.sleep(WAIT_BACK_TO_LIST + 1)
|
||||
|
||||
# 记录 Redis 去重 (仅按名称去重)
|
||||
cleaned = Kit.clean_station_name(station_name)
|
||||
await redis_kit.set_data(f"crawled:xdt:{cleaned}", "1", expire=REDIS_STATION_EXPIRE)
|
||||
|
||||
# 【策略执行】处理完第一个新场站后立即退出循环,触发翻页
|
||||
logger.info(f"已完成本页首个新场站 '{station_name}' 的处理,准备翻页。")
|
||||
break
|
||||
|
||||
# 清理已完成的后台任务
|
||||
done_tasks = [t for t in background_tasks if t.done()]
|
||||
for t in done_tasks:
|
||||
@@ -379,7 +402,9 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
no_new_data_count = 0
|
||||
|
||||
# 6. 翻页
|
||||
logger.info("执行翻页滑动...")
|
||||
# 【优化】使用“小步快跑”策略,减小滑动距离以避免跳过场站,并能更平滑地躲避广告
|
||||
# 步长已在 Setting.py 中统一配置为 SCROLL_DISTANCE_RATIO
|
||||
logger.info(f"执行翻页滑动 (步长: {SCROLL_DISTANCE_RATIO})...")
|
||||
d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
|
||||
await asyncio.sleep(WAIT_AFTER_SCROLL)
|
||||
|
||||
|
||||
@@ -204,8 +204,8 @@ def detect_rabbit_ad_close(image_path, debug_dir=None):
|
||||
cX = int(M["m10"] / M["m00"]) + roi_x1
|
||||
cY = int(M["m01"] / M["m00"]) + roi_y1
|
||||
|
||||
norm_x = int(cX / w * 1000)
|
||||
norm_y = int(cY / h * 1000)
|
||||
norm_x = cX / w
|
||||
norm_y = cY / h
|
||||
|
||||
# 避免重复
|
||||
if not any(abs(cX - c[0]) < 15 and abs(cY - c[1]) < 15 for c in candidates):
|
||||
@@ -220,17 +220,23 @@ def detect_rabbit_ad_close(image_path, debug_dir=None):
|
||||
has_x = c[5]
|
||||
# 基础分:如果有 X,大幅加分
|
||||
score = 1000 if has_x else 0
|
||||
# 距离分:越靠近预期的 (93, 830) 分越高
|
||||
dist = np.sqrt((c[3] - 93)**2 + (c[4] - 830)**2)
|
||||
score -= dist * 2
|
||||
# 距离分:越靠近预期的 (0.094, 0.830) 分越高
|
||||
dist = np.sqrt((c[3] - 0.094)**2 + (c[4] - 0.830)**2)
|
||||
score -= dist * 2000 # 归一化后距离变小,需加大权重
|
||||
# 面积分:理想面积在 500-1500 之间
|
||||
if 500 < c[2] < 1500: score += 200
|
||||
return score
|
||||
|
||||
candidates.sort(key=score_candidate, reverse=True)
|
||||
best = candidates[0]
|
||||
best_score = score_candidate(best)
|
||||
|
||||
logger.info(f"CV detected rabbit ad close button at Norm({best[3]}, {best[4]}) with score {score_candidate(best):.2f}")
|
||||
logger.info(f"CV detected rabbit ad close button at Norm({best[3]:.3f}, {best[4]:.3f}) with score {best_score:.2f}")
|
||||
|
||||
# 【优化】如果得分太低 (低于 850),说明误判概率较大,不予返回
|
||||
if best_score < 850:
|
||||
logger.info(f"Score {best_score:.2f} is below threshold 850, ignoring candidate.")
|
||||
return None
|
||||
|
||||
if debug_dir:
|
||||
os.makedirs(debug_dir, exist_ok=True)
|
||||
@@ -749,13 +755,19 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
|
||||
# 使用 Median 比 Max 更稳健,避免被单个超高卡片(如广告)带偏
|
||||
threshold_h = median_h * 0.70
|
||||
|
||||
for (y1, y2), h in zip(temp_valid_segments, heights):
|
||||
if h < threshold_h:
|
||||
for (y1, y2), card_h in zip(temp_valid_segments, heights):
|
||||
# 【优化】过滤顶部非场站区域 (例如会员图标、搜索栏等)
|
||||
# 场站列表通常在屏幕 35% 高度以后
|
||||
if y1 / h < 0.35:
|
||||
logger.info(f" Filtering out segment Y={y1}-{y2} because it's too high up (Top {y1/h:.2f} < 0.35).")
|
||||
continue
|
||||
|
||||
if card_h < threshold_h:
|
||||
logger.info(
|
||||
f" Filtering out segment Y={y1}-{y2} (H={h}) because it's too short (Threshold={threshold_h:.1f}).")
|
||||
f" Filtering out segment Y={y1}-{y2} (H={card_h}) because it's too short (Threshold={threshold_h:.1f}).")
|
||||
else:
|
||||
final_cards.append((y1, y2, final_x1, final_x2))
|
||||
logger.info(f" Card: Y={y1}-{y2}, X={final_x1}-{final_x2}, H={h}")
|
||||
logger.info(f" Card: Y={y1}-{y2}, X={final_x1}-{final_x2}, H={card_h}")
|
||||
|
||||
# 4. 保存结果
|
||||
if output_dir is None:
|
||||
@@ -812,6 +824,12 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
|
||||
card_info = {
|
||||
"id": idx + 1,
|
||||
"rect": [x1, y1, x2, y2],
|
||||
"bounds_norm": {
|
||||
"left": x1 / w,
|
||||
"top": y1 / h,
|
||||
"right": x2 / w,
|
||||
"bottom": y2 / h
|
||||
},
|
||||
"click_point": [click_x, click_y]
|
||||
}
|
||||
json_data["cards"].append(card_info)
|
||||
|
||||
@@ -55,7 +55,7 @@ async def check_and_close_ad(d):
|
||||
logger.info(f"检测到广告关闭按钮: ({x}, {y}) [Type: {ad_type}]")
|
||||
|
||||
if ad_type == "rabbit":
|
||||
logger.info(">>> 检测到兔子广告。根据最新策略,我们不再尝试关闭它,直接继续。")
|
||||
logger.info(">>> 检测到兔子广告。根据策略,我们不再尝试关闭它,直接继续。")
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
return True
|
||||
else:
|
||||
|
||||
@@ -89,6 +89,7 @@ class ReadImageKit:
|
||||
cv_rabbit_point = detect_rabbit_ad_close(image_path, debug_dir=TEMP_IMAGE_DIR)
|
||||
if cv_rabbit_point:
|
||||
logger.info(f">>> 图形学算法精准捕捉到兔子广告关闭按钮: {cv_rabbit_point}")
|
||||
# 注意:Kit.py 的 detect_rabbit_ad_close 现在返回的是 0-1.0 归一化坐标
|
||||
norm_point = cv_rabbit_point
|
||||
ad_type = "rabbit"
|
||||
|
||||
@@ -97,8 +98,8 @@ class ReadImageKit:
|
||||
device_info = cls._FALLBACK_DEVICE_INFO
|
||||
w = device_info.get("displayWidth", FALLBACK_WIDTH)
|
||||
h = device_info.get("displayHeight", FALLBACK_HEIGHT)
|
||||
x = int(norm_point[0] / 1000 * w)
|
||||
y = int(norm_point[1] / 1000 * h)
|
||||
x = int(norm_point[0] * w)
|
||||
y = int(norm_point[1] * h)
|
||||
return {"x": x, "y": y, "ad_type": ad_type}
|
||||
|
||||
# 2. 如果图形学没找到,再请求大模型进行全量检测
|
||||
@@ -110,7 +111,7 @@ class ReadImageKit:
|
||||
prompt = (
|
||||
"请仔细检查这张图片中是否存在**弹窗广告**或**悬浮广告**。\n"
|
||||
"广告可能有以下几种形式:\n"
|
||||
"1. **屏幕中央的大型弹窗广告**:通常遮挡了页面内容,内容多为优惠券、活动推广等。\n"
|
||||
"1. **屏幕中央的大型弹窗广告**:通常遮挡了页面内容,内容多为优惠券、活动推广、或“免费停车”类提示(例如:现在可免费停车N小时了!)。\n"
|
||||
"2. **'新电兔AI'专属悬浮广告**(高频):一个卡通兔子头,带有'新电兔AI'字样,关闭按钮是一个黑色圆圈内含白色X。\n"
|
||||
"3. **底部的横幅广告**:带有明显的关闭按钮。\n\n"
|
||||
"请返回关闭按钮的中心坐标。\n\n"
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user