This commit is contained in:
HuangHai
2026-01-13 14:54:02 +08:00
parent c36ab00318
commit 8815cf603e
4 changed files with 6 additions and 6 deletions

View File

@@ -1,6 +1,6 @@
# Crawl configuration
# NOTE(review): SCROLL_DISTANCE_RATIO is assigned twice below; this looks like
# the old and new lines of a diff shown together. If both lines are executed,
# the later assignment (0.6) is the value that takes effect — confirm.
SCROLL_DISTANCE_RATIO = 0.4
SCROLL_DISTANCE_RATIO = 0.6
MAX_SCROLLS = 100
MAX_CRAWL_DISTANCE = 50
# NOTE(review): unit of the expiry value is not shown here — presumably seconds; confirm.
REDIS_STATION_EXPIRE = 120

View File

@@ -1,8 +1,8 @@
# Crawl configuration
# Scroll distance ratio (0.1 ~ 0.9); the larger the value, the further each swipe scrolls.
# Slightly increased the step (from 0.15 to 0.25) to balance avoiding ads against crawl efficiency.
# NOTE(review): this assignment is overridden by the one below (likely the
# pre-change line of a diff shown next to its replacement) — confirm.
SCROLL_DISTANCE_RATIO = 0.25
# Increased the step (from 0.25 to 0.5) to improve crawl efficiency.
SCROLL_DISTANCE_RATIO = 0.5
# Maximum number of scrolls / page turns; crawling stops once this count is reached.
MAX_SCROLLS = 60
# Default crawl radius (km); crawling stops when a station farther away than this is detected.

View File

@@ -417,9 +417,9 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
if stations and new_stations_processed == 0:
no_new_data_count += 1
logger.info(f"本页所有场站均已处理过,连续 {no_new_data_count} 页无新数据。")
# 【优化】由于滑动步长较小 (0.15),连续多页重复是正常的,将阈值调大到 10
if no_new_data_count >= 10:
logger.info("连续 10 页无新数据,判定为已到底或重复循环,提前结束。")
# 【优化】由于滑动步长已调大 (0.5),连续多页重复的可能性降低
if no_new_data_count >= 5:
logger.info("连续 5 页无新数据,判定为已到底或重复循环,提前结束。")
break
else:
no_new_data_count = 0