This commit is contained in:
HuangHai
2026-01-17 08:49:42 +08:00
parent b12c9de620
commit a34083cd47
10 changed files with 74 additions and 89 deletions

View File

@@ -1,15 +1,15 @@
# Collection (crawl) settings
# NOTE(review): several names below are assigned twice in a row. This looks
# like the old/new sides of a diff hunk rendered without +/- markers; as
# written, the later assignment is the one that takes effect -- confirm.
# Per the crawler's own notes, lowering SCROLL_DISTANCE_RATIO shortens each swipe.
SCROLL_DISTANCE_RATIO = 0.5
MAX_STATIONS_COUNT = 1
FIRST_RUN_ONLY_ONE_STATION = False
# Limit on newly collected stations; the crawl loop breaks once its
# max_stations_count (presumably defaulted from this value) is reached.
MAX_STATIONS_COUNT = 2
# When true, only the first card is processed on the first screen
# (scroll_count == 1); the remaining cards are skipped there.
FIRST_RUN_ONLY_ONE_STATION = True
REDIS_STATION_EXPIRE = 120  # unit not shown here -- TODO confirm
DATA_RETENTION_DAYS = 365
# Wait-time settings (seconds)
WAIT_DETAIL_PAGE_LOAD = 1.5
WAIT_DETAIL_PAGE_LOAD = 2.5
WAIT_BACK_TO_LIST = 0.5
WAIT_AFTER_SCROLL = 2.0
WAIT_AFTER_SCROLL = 1.0
# Coordinate calculation and safety guards
SAFE_EXCLUDE_RATIO = 0.40

View File

@@ -31,7 +31,8 @@ from Apps.XinDianTu.Config.Setting import (
MAX_STATIONS_COUNT, REDIS_STATION_EXPIRE,
WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, WAIT_AFTER_SCROLL,
SAFE_EXCLUDE_RATIO,
BOTTOM_SAFE_EXCLUDE_RATIO
BOTTOM_SAFE_EXCLUDE_RATIO,
FIRST_RUN_ONLY_ONE_STATION,
)
# --- 用户配置区域 ---
@@ -49,19 +50,9 @@ TEST_CLEAR_REDIS = True
# 请尝试减小 SCROLL_DISTANCE_RATIO (例如设置为 0.4 或 0.3)。
# 这样每次滑动的距离变短,可以确保所有场站都能被完整显示并识别。
# 配置日志输出
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(sys.stdout)
]
)
logger = logging.getLogger("StationList")
from Apps.XinDianTu.Kit import setup_logger
# 强制设置所有相关模块的日志级别为 INFO,防止被第三方库干扰
logging.getLogger("OpenXinDianTu").setLevel(logging.INFO)
logging.getLogger("FullProcess").setLevel(logging.INFO)
logger = setup_logger("StationList")
class XinDianTuCrawler(BaseCrawler):
@@ -168,6 +159,9 @@ async def get_station_list(d, service, uploader, max_stations_count=MAX_STATIONS
# 【优化】不再在每页滚动时检查兔子广告,仅在 Opener 进入时检查一次
# 如果后续发现有其它非兔子广告弹出,可在此处恢复非兔子广告的检测逻辑
ad_top_y_norm = 0.78 # 默认的点击边界 (0.78)
# 新策略:滚动后的页面完全依赖“速度躲兔子”,不再因为靠近底部而丢弃卡片
if scroll_count > 1:
ad_top_y_norm = 1.0
# 检查是否已经滚动到底部 (排除状态栏后,内容与上次一致)
current_md5 = Kit.get_image_content_md5(
@@ -212,6 +206,11 @@ async def get_station_list(d, service, uploader, max_stations_count=MAX_STATIONS
new_stations_processed_in_page = 0
if json_metadata.get("cards") and stations:
for idx, card in enumerate(json_metadata["cards"]):
# 首屏策略:只处理第一个场站,其余留待滚动后在安全窗口内处理
if FIRST_RUN_ONLY_ONE_STATION and scroll_count == 1 and idx > 0:
logger.info("首屏仅处理第一个场站,跳过当前卡片。")
continue
# 检查是否已达到最大采集数量(按新采集的场站数量限制)
if total_new_processed_count >= max_stations_count:
break

View File

@@ -810,6 +810,46 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
temp_valid_segments.append((y1, y2))
# 对过长的段进行内部切分,避免将两个卡片合并为一个
refined_segments = []
SPLIT_GAP_MIN = 8
SPLIT_MARGIN = 6
GAP_STD_STRICT = max(0.0, GAP_STD_MAX - 3.0)
for y1, y2 in temp_valid_segments:
split_points = []
run_len = 0
run_start = None
for yy in range(y1 + SPLIT_MARGIN, y2 - SPLIT_MARGIN):
rm = row_means[yy]
rs = row_stds[yy]
em = edge_means[yy]
is_gap_line = (GAP_MEAN_MIN <= rm <= GAP_MEAN_MAX) and (rs < GAP_STD_STRICT) and (BG_GRAY_MIN <= em <= BG_GRAY_MAX)
if is_gap_line:
if run_len == 0:
run_start = yy
run_len += 1
else:
if run_len >= SPLIT_GAP_MIN:
sp = (run_start + yy) // 2
split_points.append(sp)
run_len = 0
run_start = None
if run_len >= SPLIT_GAP_MIN and run_start is not None:
sp = (run_start + (y2 - SPLIT_MARGIN)) // 2
split_points.append(sp)
if split_points:
prev = y1
for sp in split_points:
if sp - prev > 100:
refined_segments.append((prev, sp))
prev = sp
if y2 - prev > 100:
refined_segments.append((prev, y2))
else:
refined_segments.append((y1, y2))
temp_valid_segments = refined_segments
# 计算统一宽度
if not candidate_x1:
logger.info(" No valid width detected. Using default.")
@@ -877,67 +917,20 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
# 准备 _vl.jpg (只画框,不画红点)
vl_img = img.copy()
# 预先计算“列表浮动区域”的蓝色矩形坐标
rabbit_x1 = int(w * 0.04)
rabbit_x2 = int(w * 0.96)
rabbit_y1 = int(h * 0.74)
rabbit_y2 = int(h * 0.86)
# 在调试图上画出蓝色矩形
try:
cv2.rectangle(debug_img, (rabbit_x1, rabbit_y1), (rabbit_x2, rabbit_y2), (255, 0, 0), 3)
cv2.rectangle(vl_img, (rabbit_x1, rabbit_y1), (rabbit_x2, rabbit_y2), (255, 0, 0), 3)
logger.info(f" 标记列表浮动蓝色区域: X={rabbit_x1}-{rabbit_x2}, Y={rabbit_y1}-{rabbit_y2}")
except Exception as e:
logger.warning(f"标记兔子广告蓝色区域失败: {e}")
# 基于蓝色矩形过滤与列表浮动区域有交集的场站卡片
if final_cards:
logger.info(f" [蓝框过滤] 初始卡片数量: {len(final_cards)}")
logger.info(
f" [蓝框过滤] 蓝框坐标: X={rabbit_x1}-{rabbit_x2}, Y={rabbit_y1}-{rabbit_y2}"
)
def _intersects(card):
y1, y2, x1, x2 = card
cx1, cy1, cx2, cy2 = x1, y1, x2, y2
ix1 = max(cx1, rabbit_x1)
iy1 = max(cy1, rabbit_y1)
ix2 = min(cx2, rabbit_x2)
iy2 = min(cy2, rabbit_y2)
return ix1 < ix2 and iy1 < iy2
original_cards = list(final_cards)
filtered_cards = []
for idx, card in enumerate(original_cards, start=1):
y1, y2, x1, x2 = card
if _intersects(card):
logger.info(
f" [蓝框过滤] 丢弃卡片#{idx}: X={x1}-{x2}, Y={y1}-{y2} (与蓝框有交集)"
)
else:
logger.info(
f" [蓝框过滤] 保留卡片#{idx}: X={x1}-{x2}, Y={y1}-{y2} (与蓝框无交集)"
)
filtered_cards.append(card)
logger.info(
f" [蓝框过滤] 最终保留 {len(filtered_cards)} 个卡片, 丢弃 {len(original_cards) - len(filtered_cards)}"
)
final_cards = filtered_cards
else:
logger.info(" [蓝框过滤] 没有可过滤的卡片。")
logger.info(f" Step [2.1/VL] 准备在 VL 图片上绘制 {len(final_cards)} 个场站的绿色方框...")
for idx, (y1, y2, x1, x2) in enumerate(final_cards):
# 轻微向上扩展卡片上边界,避免漏掉标题区域
PAD_TOP = 5
draw_y1 = max(0, y1 - PAD_TOP)
draw_y2 = y2
# 计算点击点 (左上角,避免被底部按钮遮挡)
# 策略:X偏移 15%, Y偏移 20%
w_card = x2 - x1
h_card = y2 - y1
h_card = draw_y2 - draw_y1
click_x = int(x1 + w_card * 0.15)
click_y = int(y1 + h_card * 0.20)
click_y = int(draw_y1 + h_card * 0.20)
# [修改] 不再保存单张子图,只记录元数据
# card = img[y1:y2, x1:x2]
@@ -952,28 +945,28 @@ def crop_cards_from_image(img_path, output_dir=None, save_debug=True):
# 在标记图上画红点 (实心圆, 半径10, 红色BGR)
cv2.circle(debug_img, (click_x, click_y), 10, (0, 0, 255), -1)
# [修改] 必须画绿框,因为后续视觉模型依赖这个框来识别范围
cv2.rectangle(debug_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.rectangle(debug_img, (x1, draw_y1), (x2, draw_y2), (0, 255, 0), 2)
# 在 _vl 图上只画绿框
cv2.rectangle(vl_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.rectangle(vl_img, (x1, draw_y1), (x2, draw_y2), (0, 255, 0), 2)
# 收集 JSON 数据
card_info = {
"id": idx + 1,
"rect": [x1, y1, x2, y2],
"rect": [x1, draw_y1, x2, draw_y2],
"bounds_norm": {
"left": x1 / w,
"top": y1 / h,
"top": draw_y1 / h,
"right": x2 / w,
"bottom": y2 / h
"bottom": draw_y2 / h
},
"click_point": [click_x, click_y]
}
json_data["cards"].append(card_info)
# 记录区域信息供调用者使用 (如果需要)
# 格式: (None, (click_x, click_y), (x1, y1, x2, y2))
results.append((None, (click_x, click_y), (x1, y1, x2, y2)))
# 格式: (None, (click_x, click_y), (x1, draw_y1, x2, draw_y2))
results.append((None, (click_x, click_y), (x1, draw_y1, x2, draw_y2)))
# [删除] 之前生成的单张 _for_vl.jpg 逻辑已移除

View File

@@ -1,23 +1,17 @@
# coding=utf-8
import asyncio
import logging
import os
import time
import uuid
import uiautomator2 as u2
from Apps.XinDianTu.Kit import take_screenshot
from Apps.XinDianTu.Kit import take_screenshot, setup_logger
from Apps.XinDianTu.ReadImageKit import ReadImageKit
from Config.Config import TEMP_IMAGE_DIR
# pip install adbutils
# 配置日志输出,方便调试和监控
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("OpenXinDianTu")
logger = setup_logger("OpenXinDianTu")
# 获取当前脚本所在目录
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@@ -1,5 +1,4 @@
import hashlib
import logging
import os
import re
import sys
@@ -10,6 +9,7 @@ project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(_
if project_root not in sys.path:
sys.path.append(project_root)
from Apps.XinDianTu.Kit import setup_logger
from Apps.XinDianTu.ReadImageKit import ReadImageKit
from DbKit.Db import Db
from Config.Config import DB_URL, PRICE_FLATTEN_TO_24H_GLOBAL
@@ -18,8 +18,7 @@ from Model.StationStatus import StationStatus
from Model.StationPriceSchedule import StationPriceSchedule
from Apps.XinDianTu.Config.Setting import PRICE_FLATTEN_TO_24H
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger = setup_logger("Service")
class XinDianTuService:
def __init__(self):