Files
aiData/Apps/AiTeJiYiChong/Crawler.py
HuangHai ac79e44282 'commit'
2026-01-12 20:11:18 +08:00

265 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import asyncio
import logging
import uuid
import os
import sys
import time
from PIL import Image
# 将项目根目录添加到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
sys.path.append(project_root)
import uiautomator2 as u2
from Apps.AiTeJiYiChong import Kit
from Apps.AiTeJiYiChong.Kit import take_screenshot
from Apps.AiTeJiYiChong.ReadImageKit import ReadImageKit
from Util.RedisKit import RedisKit
from Apps.AiTeJiYiChong.Service import AiTeJiYiChongService
from Config.Config import TEMP_IMAGE_DIR
from Apps.AiTeJiYiChong.Config.Setting import (
SCROLL_DISTANCE_RATIO,
MAX_SCROLLS, REDIS_STATION_EXPIRE,
WAIT_AFTER_SCROLL,
MAX_CRAWL_DISTANCE
)
logger = logging.getLogger("AiTeJiYiChongCrawler")
async def is_already_crawled(redis_kit, station_name):
"""
检查场站是否已经爬取过,仅按名称去重
"""
cleaned_name = Kit.clean_station_name(station_name)
if not cleaned_name:
return False
redis_key = f"crawled:aite:{cleaned_name}"
if await redis_kit.get_data(redis_key):
return True
return False
async def get_station_list(d, service, max_scrolls=MAX_SCROLLS):
"""
获取场站列表并处理翻页
"""
redis_kit = RedisKit()
window_size = d.window_size()
w, h = window_size[0], window_size[1]
device_info = d.info
device_info['width'] = w
device_info['height'] = h
logger.info(f"开始爬取列表,设备: {device_info.get('productName')} | 分辨率: {w}x{h}")
# 用于追踪后台分析任务
background_tasks = []
for i in range(max_scrolls + 1):
logger.info(f"正在处理第 {i + 1} 页...")
# ... (前序步骤保持不变)
# (在此处截断,以便后续替换循环体内的逻辑)
# 1. 拍摄截图
image_uuid = str(uuid.uuid4())
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
logger.info(f"列表页截图已完成: {screenshot_path}")
# 2. 执行图形学分析,生成 _flag.jpg, _vl.jpg 和 .json
logger.info("正在执行图形学切片分析...")
json_data = Kit.crop_cards_from_image(screenshot_path)
# 3. 调用混合模式识别 (图形学切片 + 本地 OCR)
stations = await service.process_station_list_hybrid(screenshot_path, device_info=device_info)
logger.info(f"本页识别到 {len(stations)} 个场站")
# 4. 匹配几何卡片与识别结果 (混合模式下已经包含在 stations 中,但为了兼容旧逻辑进行填充)
if json_data and json_data.get("cards") and stations:
for card in json_data["cards"]:
card_rect = card["rect"] # [x1, y1, x2, y2]
for st in stations:
st_bounds = st.get("bounds") # [x1, y1, x2, y2] (0-1000)
if not st_bounds: continue
# 转换 VL 坐标到像素坐标
st_y1_px = st_bounds[1] * h / 1000
st_y2_px = st_bounds[3] * h / 1000
# 计算 y 轴重叠
overlap = min(card_rect[3], st_y2_px) - max(card_rect[1], st_y1_px)
if overlap > 50: # 有显著重叠
card["station_name"] = st.get("station_name")
logger.info(f"匹配成功: 几何卡片 {card_rect} -> 场站 {card['station_name']}")
break
# 5. 遍历处理本页所有场站
if json_data and json_data.get("cards"):
for card_idx, card in enumerate(json_data["cards"]):
station_name = card.get("station_name")
if not station_name:
logger.warning(f"{card_idx + 1} 个几何卡片未匹配到场站名称,跳过。")
continue
# 检查 Redis 去重 (使用新的模糊匹配逻辑)
if await is_already_crawled(redis_kit, station_name):
logger.info(f"场站 {station_name} 匹配到已处理记录,跳过。")
continue
click_x, click_y = card["click_point"]
logger.info(f"准备处理第 {card_idx + 1} 个场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
d.click(click_x, click_y)
# 等待二级页面加载
await asyncio.sleep(3)
# 截取二级页面图
detail_uuid = f"detail_{station_name}_{image_uuid}"
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
# 【异步优化】后台解析详情页基础数据,不阻塞 UI 流程
logger.info(f"已启动后台分析详情页: {station_name}")
async def detail_task_wrapper(p, name):
try:
await service.process_station_detail(p, station_name=name)
if os.path.exists(p): os.remove(p)
except Exception as ex:
logger.error(f"详情页分析异常 ({name}): {ex}")
task_detail = asyncio.create_task(detail_task_wrapper(detail_path, station_name))
background_tasks.append(task_detail)
# 6. 寻找 timePrice.jpg 图标并进入三级页面 (分时价格页)
time_price_template = os.path.join(os.path.dirname(__file__), "BiaoShi", "timePrice.jpg")
coords = Kit.find_template_coords(detail_path, time_price_template)
if coords:
cx, cy, conf = coords
logger.info(f"找到分时价格图标,进入三级页面...")
d.click(cx, cy)
await asyncio.sleep(2) # 等待加载
# 抓取三级页面图片(包括可能的滑动)
price_screenshots = []
price_detail_uuid = f"price_detail_{station_name}_{image_uuid}"
price_path = take_screenshot(d, price_detail_uuid, save_dir=TEMP_IMAGE_DIR)
price_screenshots.append(price_path)
# 如果需要滑动,先完成所有截图动作
last_md5 = Kit.get_file_md5(price_path)
for scroll_idx in range(2): # 减少滑动次数以加快速度通常1-2次足够
d.swipe(w // 2, int(h * 0.7), w // 2, int(h * 0.3), duration=0.4)
await asyncio.sleep(1)
scroll_path = take_screenshot(d, f"price_scroll_{scroll_idx}_{station_name}", save_dir=TEMP_IMAGE_DIR)
curr_md5 = Kit.get_file_md5(scroll_path)
if curr_md5 == last_md5: break
price_screenshots.append(scroll_path)
last_md5 = curr_md5
# 【异步优化】后台处理所有价格图片,不阻塞 UI 返回
logger.info(f"已启动后台分析价格页: {station_name},共 {len(price_screenshots)} 张图")
task_price = asyncio.create_task(analyze_prices_background(service, station_name, price_screenshots))
background_tasks.append(task_price)
# 立即从三级页面返回
d.press("back")
await asyncio.sleep(0.5)
else:
logger.warning(f"场站 {station_name} 未找到分时价格入口。")
# 从二级页面返回列表页
d.press("back")
await asyncio.sleep(0.8)
# 记录 Redis (仅按名称去重,不再使用前缀索引)
full_name_key = f"crawled:aite:{Kit.clean_station_name(station_name)}"
await redis_kit.set_data(full_name_key, "1", expire=REDIS_STATION_EXPIRE)
# 在每一页结束时,清理已完成的任务,防止内存堆积
done_tasks = [t for t in background_tasks if t.done()]
for t in done_tasks:
if t.exception():
logger.error(f"后台任务执行异常: {t.exception()}")
background_tasks.remove(t)
# 7. 列表页翻页滑动
logger.info("执行翻页滑动...")
start_x, start_y = w // 2, int(h * 0.8)
end_x, end_y = w // 2, int(h * (0.8 - SCROLL_DISTANCE_RATIO))
d.swipe(start_x, start_y, end_x, end_y, duration=0.5)
await asyncio.sleep(WAIT_AFTER_SCROLL)
# 最终等待所有后台分析任务完成
if background_tasks:
logger.info(f"正在等待剩余 {len(background_tasks)} 个后台分析任务完成...")
await asyncio.gather(*background_tasks, return_exceptions=True)
logger.info("达到最大翻页次数,爬取结束。")
return True
async def analyze_prices_background(service, station_name, image_paths):
"""
后台异步处理价格图片分析
"""
try:
all_prices = []
for path in image_paths:
prices = await ReadImageKit.get_price_detail_from_image(path)
if prices and isinstance(prices, list):
for p in prices:
if not isinstance(p, dict): continue
if not any(isinstance(np_i, dict) and np_i.get("start") == p.get("start") for np_i in all_prices):
all_prices.append(p)
if all_prices:
all_prices = [p for p in all_prices if isinstance(p, dict)]
all_prices.sort(key=lambda x: x.get("start", "00:00"))
hourly_schedule = ReadImageKit.expand_schedule_to_24h(all_prices)
await service.process_price_detail_data(station_name, hourly_schedule)
logger.info(f"后台处理完成: {station_name} 的分时价格已保存。")
# 处理完成后清理图片,释放磁盘空间
for path in image_paths:
try:
if os.path.exists(path):
os.remove(path)
logger.debug(f"已删除已处理的图片: {path}")
except Exception as e:
logger.warning(f"删除图片失败: {path}, {e}")
except Exception as e:
logger.error(f"分析价格后台任务异常 ({station_name}): {e}")
return True
async def main(service=None, do_cleanup=True):
"""
爬虫主入口
"""
if do_cleanup:
Kit.clear_temp_dir()
if service is None:
service = AiTeJiYiChongService()
await service.init_db()
d = u2.connect()
try:
await get_station_list(d, service)
return True
except Exception as e:
logger.error(f"爬取过程中出现异常: {e}")
return False
finally:
# 如果是内部初始化的 service则在此关闭
pass
if __name__ == "__main__":
asyncio.run(main())