This commit is contained in:
HuangHai
2026-01-13 11:08:03 +08:00
parent 3978123b88
commit 020304a411
5 changed files with 6 additions and 5 deletions

View File

@@ -1,8 +1,8 @@
# 采集配置
# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大
# 使用“小步快跑”策略,设置为 0.15 以避开广告并防止跳过场站
SCROLL_DISTANCE_RATIO = 0.15
# 稍微调大步长 (从 0.15 调至 0.25),兼顾避开广告与采集效率
SCROLL_DISTANCE_RATIO = 0.25
# 最大滑动/翻页次数,达到此次数后停止采集
MAX_SCROLLS = 100
# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集

View File

@@ -270,7 +270,7 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
if detail_ad_res:
dad_type = detail_ad_res.get("ad_type")
if dad_type == "rabbit":
logger.info(">>> 详情页检测到 rabbit 广告,判定为误报,忽略")
logger.info(">>> 详情页检测到 rabbit 广告。根据策略,我们不再尝试关闭它,直接继续")
else:
dx, dy = detail_ad_res["x"], detail_ad_res["y"]
logger.info(f"检测到详情页弹窗: {dad_type},正在点击关闭 ({dx}, {dy})...")
@@ -395,8 +395,9 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
if stations and new_stations_processed == 0:
no_new_data_count += 1
logger.info(f"本页所有场站均已处理过,连续 {no_new_data_count} 页无新数据。")
if no_new_data_count >= 3:
logger.info("连续 3 页无新数据,判定为已到底或重复循环,提前结束。")
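# [Optimization] Because the scroll step is small (SCROLL_DISTANCE_RATIO, 0.25 as of this commit, previously 0.15), several consecutive pages containing only already-processed stations are normal, so the threshold is raised from 3 to 10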
if no_new_data_count >= 10:
logger.info("连续 10 页无新数据,判定为已到底或重复循环,提前结束。")
break
else:
no_new_data_count = 0