'commit'
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
|
||||
# 采集配置
|
||||
# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大,建议 0.3-0.5 以避免错过中间内容
|
||||
SCROLL_DISTANCE_RATIO = 0.3
|
||||
# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大
|
||||
# 参考场站卡片高度(约屏幕 18-20%),设置为 0.22 以便每次精确翻页一个场站
|
||||
SCROLL_DISTANCE_RATIO = 0.22
|
||||
# 最大滑动/翻页次数,达到此次数后停止采集
|
||||
MAX_SCROLLS = 100
|
||||
# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集
|
||||
|
||||
Binary file not shown.
@@ -164,38 +164,21 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
image_uuid = str(uuid.uuid4())
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
|
||||
# 1.5 检查是否存在广告 (VL)
|
||||
# 检查是否存在广告 (VL) - 仅处理非兔子广告
|
||||
logger.info("检查是否存在广告弹窗...")
|
||||
ad_res = await ReadImageKit.detect_ad_popup(screenshot_path, device_info=device_info)
|
||||
if ad_res:
|
||||
ad_type = ad_res.get("ad_type")
|
||||
logger.info(f"检测到广告弹窗,准备关闭: {ad_res} [Type: {ad_type}]")
|
||||
|
||||
x, y = ad_res['x'], ad_res['y']
|
||||
|
||||
if ad_type == "rabbit":
|
||||
logger.info(">>> 检测到顽固的兔子广告,执行强力双击策略...")
|
||||
d.double_click(x, y, duration=0.05)
|
||||
|
||||
# 处理“误触背景卡片”的逻辑
|
||||
logger.info(">>> 等待 3s 以确认是否进入了二级页面...")
|
||||
await asyncio.sleep(3.0)
|
||||
|
||||
# 检查是否还在列表页 (新电途列表页通常有“全城”或“综合排序”按钮)
|
||||
# 如果这些按钮不存在,说明可能进到了详情页
|
||||
is_list_page = d(text="全城").exists or d(text="综合排序").exists or d(text="搜索场站").exists
|
||||
if not is_list_page:
|
||||
logger.info(">>> 检测到当前不在列表页,尝试执行返回操作...")
|
||||
d.press("back")
|
||||
await asyncio.sleep(1.5)
|
||||
else:
|
||||
logger.info(">>> 当前仍在列表页,无需返回。")
|
||||
else:
|
||||
if ad_type != "rabbit":
|
||||
x, y = ad_res['x'], ad_res['y']
|
||||
logger.info(f"检测到非兔子广告: {ad_res} [Type: {ad_type}],正在关闭...")
|
||||
d.click(x, y)
|
||||
await asyncio.sleep(1.5)
|
||||
# 重新截图
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
# 重新截图
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||||
else:
|
||||
logger.info(">>> 检测到兔子广告。根据策略不再处理,直接开始识别场站。")
|
||||
|
||||
# 检查是否已经滚动到底部 (排除状态栏后,内容与上次一致)
|
||||
current_md5 = Kit.get_image_content_md5(
|
||||
@@ -251,7 +234,10 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
logger.info(f"场站 {station_name} 匹配到已处理记录,跳过。")
|
||||
continue
|
||||
|
||||
# 【策略调整】每页仅处理第一个识别到的新场站,确保其不被底部广告遮挡
|
||||
logger.info(f">>> 发现新场站 '{station_name}'。根据最新策略,每页仅处理第一个场站以避开广告。")
|
||||
new_stations_processed += 1
|
||||
|
||||
click_x, click_y = card["click_point"]
|
||||
logger.info(f"准备处理第 {idx + 1} 个场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
|
||||
|
||||
@@ -331,8 +317,6 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
# 2. 与详情页对比 (使用默认排除比例,与 before_click_md5 一致)
|
||||
check_detail_md5 = Kit.get_image_content_md5(check_back_path)
|
||||
|
||||
if os.path.exists(check_back_path): os.remove(check_back_path)
|
||||
|
||||
if check_list_md5 == current_md5:
|
||||
logger.info("检测到已直接返回列表页,跳过后续返回操作。")
|
||||
should_back_to_list = False
|
||||
@@ -352,6 +336,9 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
else:
|
||||
logger.info("VL 未发现'全部时段'按钮,判定已回到列表页(或非详情页),停止回退以防误退。")
|
||||
should_back_to_list = False
|
||||
|
||||
# 使用完毕后再清理检查用的截图
|
||||
if os.path.exists(check_back_path): os.remove(check_back_path)
|
||||
|
||||
else:
|
||||
logger.warning(f"点击 '{station_name}' 的 '全部时段' 按钮后页面无明显变化,跳过三级页面处理。")
|
||||
@@ -364,12 +351,17 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
|
||||
# 从二级页面返回 (仅当确实需要返回时)
|
||||
if should_back_to_list:
|
||||
d.press("back")
|
||||
logger.info(f"等待 {WAIT_BACK_TO_LIST} 秒返回列表...")
|
||||
await asyncio.sleep(WAIT_BACK_TO_LIST)
|
||||
|
||||
# 记录 Redis 去重 (仅按名称去重)
|
||||
cleaned = Kit.clean_station_name(station_name)
|
||||
await redis_kit.set_data(f"crawled:xdt:{cleaned}", "1", expire=REDIS_STATION_EXPIRE)
|
||||
|
||||
# 【策略执行】处理完第一个新场站后立即退出循环,触发翻页
|
||||
logger.info(f"已完成本页首个新场站 '{station_name}' 的处理,准备翻页。")
|
||||
break
|
||||
|
||||
# 清理已完成的后台任务
|
||||
done_tasks = [t for t in background_tasks if t.done()]
|
||||
for t in done_tasks:
|
||||
|
||||
@@ -24,84 +24,59 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
async def check_and_close_ad(d):
|
||||
"""
|
||||
检测并关闭广告弹窗(仅使用 VL 视觉模型)
|
||||
检测并关闭广告弹窗(循环检测直至无广告)
|
||||
"""
|
||||
logger.info("开始检测广告弹窗 (VL方案)...")
|
||||
|
||||
# [已移除] 策略:先向上滑动一点,触发悬浮广告(如兔子)收起/躲避
|
||||
# 用户反馈:一滚动,兔子就藏起来了,导致无法检测到广告,因此取消此逻辑。
|
||||
# try:
|
||||
# w, h = d.window_size()
|
||||
# logger.info(f"执行微小滑动 (Swipe Up),尝试触发悬浮广告收起...")
|
||||
# # 从 70% 处滑到 50% 处,模拟手指上滑,页面内容上移
|
||||
# d.swipe(w * 0.5, h * 0.7, w * 0.5, h * 0.5, duration=0.3)
|
||||
# # 等待滑动动画结束和广告收起动画
|
||||
# await asyncio.sleep(0.5)
|
||||
# except Exception as e:
|
||||
# logger.warning(f"滑动操作异常: {e}")
|
||||
|
||||
# 1. 拍摄截图
|
||||
t1 = time.time()
|
||||
image_uuid = str(uuid.uuid4())
|
||||
# 使用相对路径: 基于当前脚本目录下的 Images 文件夹
|
||||
save_dir = TEMP_IMAGE_DIR
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir)
|
||||
logger.info(f"Step [广告检测截图] 耗时: {time.time() - t1:.4f}s")
|
||||
|
||||
# 2. 视觉大模型检测方案 (VL Model)
|
||||
try:
|
||||
window_size = d.window_size()
|
||||
device_info = {
|
||||
"width": window_size[0],
|
||||
"height": window_size[1],
|
||||
"productName": d.info.get('productName', 'unknown')
|
||||
}
|
||||
max_loops = 3 # 最大检测 3 轮,防止死循环
|
||||
for loop_idx in range(max_loops):
|
||||
logger.info(f"开始第 {loop_idx + 1} 轮广告检测...")
|
||||
|
||||
# 使用最新的 detect_ad_popup 方法
|
||||
ad_result = await ReadImageKit.detect_ad_popup(screenshot_path, device_info=device_info)
|
||||
if ad_result:
|
||||
x, y = ad_result["x"], ad_result["y"]
|
||||
ad_type = ad_result.get("ad_type")
|
||||
# 1. 拍摄截图
|
||||
t1 = time.time()
|
||||
image_uuid = str(uuid.uuid4())
|
||||
save_dir = TEMP_IMAGE_DIR
|
||||
screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir)
|
||||
logger.info(f"Step [广告检测截图] 耗时: {time.time() - t1:.4f}s")
|
||||
|
||||
# 2. 视觉大模型检测方案 (VL Model)
|
||||
try:
|
||||
window_size = d.window_size()
|
||||
device_info = {
|
||||
"width": window_size[0],
|
||||
"height": window_size[1],
|
||||
"productName": d.info.get('productName', 'unknown')
|
||||
}
|
||||
|
||||
logger.info(f"通过视觉大模型检测到广告关闭按钮: ({x}, {y}) [Type: {ad_type}]")
|
||||
|
||||
# 针对“新电兔AI”兔子广告的强力关闭策略
|
||||
if ad_type == "rabbit":
|
||||
logger.info(">>> 检测到顽固的兔子广告,执行强力双击策略...")
|
||||
d.double_click(x, y, duration=0.05)
|
||||
# 使用最新的 detect_ad_popup 方法
|
||||
ad_result = await ReadImageKit.detect_ad_popup(screenshot_path, device_info=device_info)
|
||||
if ad_result:
|
||||
x, y = ad_result["x"], ad_result["y"]
|
||||
ad_type = ad_result.get("ad_type")
|
||||
|
||||
# 处理“误触背景卡片”的逻辑
|
||||
logger.info(">>> 等待 3s 以确认是否进入了二级页面...")
|
||||
await asyncio.sleep(3.0)
|
||||
logger.info(f"检测到广告关闭按钮: ({x}, {y}) [Type: {ad_type}]")
|
||||
|
||||
# 检查是否还在列表页
|
||||
is_list_page = d(text="全城").exists or d(text="综合排序").exists or d(text="搜索场站").exists
|
||||
if not is_list_page:
|
||||
logger.info(">>> 检测到当前不在列表页,尝试执行返回操作...")
|
||||
d.press("back")
|
||||
await asyncio.sleep(1.5)
|
||||
if ad_type == "rabbit":
|
||||
logger.info(">>> 检测到兔子广告。根据最新策略,我们不再尝试关闭它,直接继续。")
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
return True
|
||||
else:
|
||||
logger.info(">>> 当前仍在列表页,无需返回。")
|
||||
logger.info(f">>> 正在点击关闭非兔子广告 ({ad_type})...")
|
||||
d.click(x, y)
|
||||
await asyncio.sleep(2.0) # 等待关闭动画
|
||||
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
# 继续下一轮循环,检查是否还有其他广告
|
||||
continue
|
||||
else:
|
||||
d.click(x, y)
|
||||
|
||||
if os.path.exists(screenshot_path):
|
||||
os.remove(screenshot_path)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"视觉大模型广告检测异常: {e}")
|
||||
logger.info("本轮未检测到广告。")
|
||||
if os.path.exists(screenshot_path):
|
||||
os.remove(screenshot_path)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"视觉大模型广告检测异常: {e}")
|
||||
if os.path.exists(screenshot_path): os.remove(screenshot_path)
|
||||
break
|
||||
|
||||
logger.info(f"VL 检测完成,未发现已知广告。")
|
||||
|
||||
# 如果未检测到广告,保留截图以便调试(保存为 debug_ad_failure.jpg)
|
||||
debug_path = os.path.join(os.path.dirname(screenshot_path), "debug_ad_failure.jpg")
|
||||
if os.path.exists(debug_path):
|
||||
os.remove(debug_path)
|
||||
if os.path.exists(screenshot_path):
|
||||
os.rename(screenshot_path, debug_path)
|
||||
logger.info(f"未检测到广告,截图已保存至: {debug_path} (请检查该图片是否真的包含广告)")
|
||||
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -173,6 +173,24 @@ class ReadImageKit:
|
||||
logger.error(f"Failed to detect ad popup: {e}")
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _to_minutes(t_str: str) -> int:
|
||||
"""HH:MM -> 分钟数"""
|
||||
if not t_str or ":" not in t_str:
|
||||
return 0
|
||||
try:
|
||||
h, m = map(int, t_str.split(":"))
|
||||
return h * 60 + m
|
||||
except:
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _fmt(t: int) -> str:
|
||||
"""分钟数 -> HH:MM"""
|
||||
h = t // 60
|
||||
m = t % 60
|
||||
return f"{h:02d}:{m:02d}"
|
||||
|
||||
@staticmethod
|
||||
def _extract_json(text: str) -> str:
|
||||
if not text:
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user