aiData/Apps/AiTeJiYiChong/Crawler.py

# coding=utf-8
import asyncio
import logging
import uuid
import os
import sys
import time
from PIL import Image

# 将项目根目录添加到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)

import uiautomator2 as u2
from Apps.AiTeJiYiChong import Kit
from Apps.AiTeJiYiChong.Kit import take_screenshot
from Apps.AiTeJiYiChong.ReadImageKit import ReadImageKit
from Util.RedisKit import RedisKit
from Apps.AiTeJiYiChong.Service import AiTeJiYiChongService
from Config.Config import TEMP_IMAGE_DIR
from Apps.AiTeJiYiChong.Config.Setting import (
    SCROLL_DISTANCE_RATIO,
    MAX_SCROLLS, REDIS_STATION_EXPIRE,
    WAIT_AFTER_SCROLL,
    MAX_CRAWL_DISTANCE
)

logger = logging.getLogger("AiTeJiYiChongCrawler")

async def is_already_crawled(redis_kit, station_name):
    """
    检查场站是否已经爬取过，仅按名称去重
    """
    cleaned_name = Kit.clean_station_name(station_name)
    if not cleaned_name:
        return False

    redis_key = f"crawled:aite:{cleaned_name}"
    if await redis_kit.get_data(redis_key):
        return True
    return False

async def get_station_list(d, service, max_scrolls=MAX_SCROLLS):
    """
    获取场站列表并处理翻页
    """
    redis_kit = RedisKit()
    window_size = d.window_size()
    w, h = window_size[0], window_size[1]

    device_info = d.info
    device_info['width'] = w
    device_info['height'] = h

    logger.info(f"开始爬取列表，设备: {device_info.get('productName')} | 分辨率: {w}x{h}")

    # 用于追踪后台分析任务
    background_tasks = []

    for i in range(max_scrolls + 1):
        logger.info(f"正在处理第 {i + 1} 页...")
        # ... (前序步骤保持不变)

        # (在此处截断，以便后续替换循环体内的逻辑)


        # 1. 拍摄截图
        image_uuid = str(uuid.uuid4())
        screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
        logger.info(f"列表页截图已完成: {screenshot_path}")

        # 2. 执行图形学分析，生成 _flag.jpg, _vl.jpg 和 .json
        logger.info("正在执行图形学切片分析...")
        json_data = Kit.crop_cards_from_image(screenshot_path)

        # 3. 调用混合模式识别 (图形学切片 + 本地 OCR)
        stations = await service.process_station_list_hybrid(screenshot_path, device_info=device_info)
        logger.info(f"本页识别到 {len(stations)} 个场站")

        # 4. 匹配几何卡片与识别结果 (混合模式下已经包含在 stations 中，但为了兼容旧逻辑进行填充)
        if json_data and json_data.get("cards") and stations:
            for card in json_data["cards"]:
                card_rect = card["rect"] # [x1, y1, x2, y2]
                for st in stations:
                    st_bounds = st.get("bounds") # [x1, y1, x2, y2] (0-1000)
                    if not st_bounds: continue

                    # 转换 VL 坐标到像素坐标
                    st_y1_px = st_bounds[1] * h / 1000
                    st_y2_px = st_bounds[3] * h / 1000

                    # 计算 y 轴重叠
                    overlap = min(card_rect[3], st_y2_px) - max(card_rect[1], st_y1_px)
                    if overlap > 50: # 有显著重叠
                        card["station_name"] = st.get("station_name")
                        logger.info(f"匹配成功: 几何卡片 {card_rect} -> 场站 {card['station_name']}")
                        break

        # 5. 遍历处理本页所有场站
        if json_data and json_data.get("cards"):
            for card_idx, card in enumerate(json_data["cards"]):
                station_name = card.get("station_name")
                if not station_name:
                    logger.warning(f"第 {card_idx + 1} 个几何卡片未匹配到场站名称，跳过。")
                    continue

                # 检查 Redis 去重 (使用新的模糊匹配逻辑)
                if await is_already_crawled(redis_kit, station_name):
                    logger.info(f"场站 {station_name} 匹配到已处理记录，跳过。")
                    continue

                click_x, click_y = card["click_point"]
                logger.info(f"准备处理第 {card_idx + 1} 个场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
                d.click(click_x, click_y)

                # 等待二级页面加载
                await asyncio.sleep(3)

                # 截取二级页面图
                detail_uuid = f"detail_{station_name}_{image_uuid}"
                detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)

                # 【异步优化】后台解析详情页基础数据，不阻塞 UI 流程
                logger.info(f"已启动后台分析详情页: {station_name}")

                async def detail_task_wrapper(p, name):
                    try:
                        await service.process_station_detail(p, station_name=name)
                        if os.path.exists(p): os.remove(p)
                    except Exception as ex:
                        logger.error(f"详情页分析异常 ({name}): {ex}")

                task_detail = asyncio.create_task(detail_task_wrapper(detail_path, station_name))
                background_tasks.append(task_detail)

                # 6. 寻找 timePrice.jpg 图标并进入三级页面 (分时价格页)
                time_price_template = os.path.join(os.path.dirname(__file__), "BiaoShi", "timePrice.jpg")
                coords = Kit.find_template_coords(detail_path, time_price_template)

                if coords:
                    cx, cy, conf = coords
                    logger.info(f"找到分时价格图标，进入三级页面...")
                    d.click(cx, cy)
                    await asyncio.sleep(2) # 等待加载

                    # 抓取三级页面图片（包括可能的滑动）
                    price_screenshots = []
                    price_detail_uuid = f"price_detail_{station_name}_{image_uuid}"
                    price_path = take_screenshot(d, price_detail_uuid, save_dir=TEMP_IMAGE_DIR)
                    price_screenshots.append(price_path)

                    # 如果需要滑动，先完成所有截图动作
                    last_md5 = Kit.get_file_md5(price_path)
                    for scroll_idx in range(2): # 减少滑动次数以加快速度，通常1-2次足够
                        d.swipe(w // 2, int(h * 0.7), w // 2, int(h * 0.3), duration=0.4)
                        await asyncio.sleep(1)
                        scroll_path = take_screenshot(d, f"price_scroll_{scroll_idx}_{station_name}", save_dir=TEMP_IMAGE_DIR)
                        curr_md5 = Kit.get_file_md5(scroll_path)
                        if curr_md5 == last_md5: break
                        price_screenshots.append(scroll_path)
                        last_md5 = curr_md5

                    # 【异步优化】后台处理所有价格图片，不阻塞 UI 返回
                    logger.info(f"已启动后台分析价格页: {station_name}，共 {len(price_screenshots)} 张图")
                    task_price = asyncio.create_task(analyze_prices_background(service, station_name, price_screenshots))
                    background_tasks.append(task_price)

                    # 立即从三级页面返回
                    d.press("back")
                    await asyncio.sleep(0.5)
                else:
                    logger.warning(f"场站 {station_name} 未找到分时价格入口。")

                # 从二级页面返回列表页
                d.press("back")
                await asyncio.sleep(0.8)

                # 记录 Redis (仅按名称去重，不再使用前缀索引)
                full_name_key = f"crawled:aite:{Kit.clean_station_name(station_name)}"
                await redis_kit.set_data(full_name_key, "1", expire=REDIS_STATION_EXPIRE)

        # 在每一页结束时，清理已完成的任务，防止内存堆积
        done_tasks = [t for t in background_tasks if t.done()]
        for t in done_tasks:
            if t.exception():
                logger.error(f"后台任务执行异常: {t.exception()}")
            background_tasks.remove(t)

        # 7. 列表页翻页滑动
        logger.info("执行翻页滑动...")
        start_x, start_y = w // 2, int(h * 0.8)
        end_x, end_y = w // 2, int(h * (0.8 - SCROLL_DISTANCE_RATIO))
        d.swipe(start_x, start_y, end_x, end_y, duration=0.5)
        await asyncio.sleep(WAIT_AFTER_SCROLL)

    # 最终等待所有后台分析任务完成
    if background_tasks:
        logger.info(f"正在等待剩余 {len(background_tasks)} 个后台分析任务完成...")
        await asyncio.gather(*background_tasks, return_exceptions=True)

    logger.info("达到最大翻页次数，爬取结束。")
    return True

async def analyze_prices_background(service, station_name, image_paths):
    """
    后台异步处理价格图片分析
    """
    try:
        all_prices = []
        for path in image_paths:
            prices = await ReadImageKit.get_price_detail_from_image(path)
            if prices and isinstance(prices, list):
                for p in prices:
                    if not isinstance(p, dict): continue
                    if not any(isinstance(np_i, dict) and np_i.get("start") == p.get("start") for np_i in all_prices):
                        all_prices.append(p)

        if all_prices:
            all_prices = [p for p in all_prices if isinstance(p, dict)]
            all_prices.sort(key=lambda x: x.get("start", "00:00"))
            hourly_schedule = ReadImageKit.expand_schedule_to_24h(all_prices)
            await service.process_price_detail_data(station_name, hourly_schedule)
            logger.info(f"后台处理完成: {station_name} 的分时价格已保存。")

        # 处理完成后清理图片，释放磁盘空间
        for path in image_paths:
            try:
                if os.path.exists(path):
                    os.remove(path)
                    logger.debug(f"已删除已处理的图片: {path}")
            except Exception as e:
                logger.warning(f"删除图片失败: {path}, {e}")

    except Exception as e:
        logger.error(f"分析价格后台任务异常 ({station_name}): {e}")
    return True


async def main(service=None, do_cleanup=True):
    """
    爬虫主入口
    """
    if do_cleanup:
        Kit.clear_temp_dir()

    if service is None:
        service = AiTeJiYiChongService()
        await service.init_db()

    d = u2.connect()

    try:
        await get_station_list(d, service)
        return True
    except Exception as e:
        logger.error(f"爬取过程中出现异常: {e}")
        return False
    finally:
        # 如果是内部初始化的 service，则在此关闭
        pass

if __name__ == "__main__":
    asyncio.run(main())