aiData/Apps/AiTeJiYiChong/Crawler.py

# coding=utf-8
import asyncio
import logging
import uuid
import os
import sys
import time
from datetime import datetime
from PIL import Image

# 将项目根目录添加到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)

import uiautomator2 as u2
from Apps.AiTeJiYiChong import Kit
from Apps.AiTeJiYiChong.Kit import take_screenshot
from Apps.AiTeJiYiChong.ReadImageKit import ReadImageKit
from Util.RedisKit import RedisKit
from Apps.AiTeJiYiChong.Service import AiTeJiYiChongService
from Config.Config import TEMP_IMAGE_DIR
from Apps.AiTeJiYiChong.Config.Setting import (
    SCROLL_DISTANCE_RATIO,
    MAX_STATIONS_COUNT, REDIS_STATION_EXPIRE,
    WAIT_AFTER_SCROLL,
    SAFE_EXCLUDE_RATIO,
    BOTTOM_SAFE_EXCLUDE_RATIO,
    TEST_CLEAR_REDIS
)

logger = logging.getLogger("AiTeJiYiChongCrawler")

async def clean_redis_data(redis_kit):
    """
    [Testing] 清除 Redis 中的已爬取记录
    """
    if not TEST_CLEAR_REDIS:
        return

    logger.warning("!!! [Testing] 正在清除 Redis 中的艾特吉易充已爬取记录...")
    keys = await redis_kit.keys("crawled:aite:*")
    if keys:
        for key in keys:
            await redis_kit.delete(key)
        logger.info(f"已清除 {len(keys)} 条记录。")
    else:
        logger.info("Redis 中没有找到相关记录。")

async def is_already_crawled(redis_kit, station_name):
    """
    检查场站是否已经爬取过，仅按名称去重
    """
    cleaned_name = Kit.clean_station_name(station_name)
    if not cleaned_name:
        return False

    redis_key = f"crawled:aite:{cleaned_name}"
    if await redis_kit.get_data(redis_key):
        return True
    return False

async def get_station_list(d, service, max_stations_count=MAX_STATIONS_COUNT):
    """
    获取场站列表并处理翻页
    """
    redis_kit = RedisKit()
    window_size = d.window_size()
    w, h = window_size[0], window_size[1]

    device_info = d.info
    device_info['width'] = w
    device_info['height'] = h

    logger.info(f"开始爬取列表，设备: {device_info.get('productName')} | 分辨率: {w}x{h} | 目标数量: {max_stations_count}")

    # 用于追踪后台分析任务
    background_tasks = []
    last_list_md5 = None
    no_new_data_count = 0
    total_encountered_count = 0
    total_new_processed_count = 0
    scroll_count = 0

    while total_encountered_count < max_stations_count:
        scroll_count += 1
        logger.info(f"正在处理第 {scroll_count} 次滚动 (已遇到: {total_encountered_count}/{max_stations_count}, 新采集: {total_new_processed_count})...")

        # 1. 拍摄截图
        image_uuid = str(uuid.uuid4())
        screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)

        # 检查是否已经滚动到底部 (排除状态栏后，内容与上次一致)
        current_md5 = Kit.get_image_content_md5(
            screenshot_path,
            top_ratio=SAFE_EXCLUDE_RATIO,
            bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO
        )
        if last_list_md5 and current_md5 == last_list_md5:
            logger.info("检测到列表核心内容无变化，已到达底部或加载完成，提前结束。")
            if os.path.exists(screenshot_path): os.remove(screenshot_path)
            break
        last_list_md5 = current_md5

        logger.info(f"列表页截图已完成: {screenshot_path}")

        # 2. 执行本地图形学分析，生成 _vl.jpg 和 .json
        logger.info("正在执行图形学切片分析...")
        json_metadata = Kit.crop_cards_from_image(screenshot_path, save_debug=True)

        vl_img_path = screenshot_path.replace(".jpg", "_vl.jpg")

        if not json_metadata or not json_metadata.get("cards"):
            logger.warning("未识别到任何卡片，跳过当前页")
            continue

        # 3. 后台异步上传 _vl.jpg 供以后参考 (如果需要，目前仅本地处理)

        # 4. 调用 VL 模型识别场站信息 (整页识别)
        logger.info("正在调用 VL 模型识别场站信息 (整页识别)...")
        # 优先使用带绿框的图片进行识别
        img_to_process = vl_img_path if os.path.exists(vl_img_path) else screenshot_path
        stations = await service.process_station_list_vl(img_to_process, json_metadata, device_info=device_info, max_count=max_stations_count - total_encountered_count)
        logger.info(f"本页识别到 {len(stations)} 个有效场站")

        ad_top_y_norm = 0.78 # 默认点击边界

        # 5. 遍历处理本页所有场站
        new_stations_processed_in_page = 0
        if json_metadata.get("cards") and stations:
            for card_idx, card in enumerate(json_metadata["cards"]):
                # 检查是否已达到最大采集数量
                if total_encountered_count >= max_stations_count:
                    break

                # 检查索引是否越界 (VL 模型可能返回的数组长度不一致)
                if card_idx < len(stations) and stations[card_idx]:
                    station = stations[card_idx]
                    station_name = station.get("station_name")
                    if not station_name: continue

                    # 只要是有效的场站，就计入已遇到数量
                    total_encountered_count += 1

                    # 检查 Redis 去重
                    if await is_already_crawled(redis_kit, station_name):
                        logger.info(f"场站 {station_name} 匹配到已处理记录，跳过。({total_encountered_count}/{max_stations_count})")
                        continue

                    # 检查是否被遮挡或太靠近底部
                    card_bottom = card["bounds_norm"]["bottom"]
                    if card_bottom > ad_top_y_norm:
                        logger.warning(f"场站 '{station_name}' 被遮挡或太靠近底部 (Bottom {card_bottom:.2f} > {ad_top_y_norm})，留待下次滚动处理。")
                        # 还没处理，不能算作已遇到
                        total_encountered_count -= 1
                        continue

                    # 正常处理新场站
                    click_x, click_y = card["click_point"]
                    logger.info(f">>> 发现新场站 '{station_name}'，开始处理... ({total_encountered_count}/{max_stations_count})")
                    new_stations_processed_in_page += 1
                    total_new_processed_count += 1
                    d.click(click_x, click_y)

                    # 等待二级页面加载
                    await asyncio.sleep(3)

                    should_back_to_list = True

                    # 截取二级页面图
                    detail_uuid = f"detail_{station_name}_{image_uuid}"
                    detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)

                    # 后台分析详情页基础数据
                    logger.info(f"已启动后台分析详情页: {station_name}")

                    async def detail_task_wrapper(p, name):
                        try:
                            await service.process_station_detail(p, station_name=name)
                            if os.path.exists(p): os.remove(p)
                        except Exception as ex:
                            logger.error(f"详情页分析异常 ({name}): {ex}")

                    task_detail = asyncio.create_task(detail_task_wrapper(detail_path, station_name))
                    background_tasks.append(task_detail)

                    # 4. 寻找分时价格入口并进入三级页面
                    # 【优化】梯级识别策略：优先模板匹配 (timePrice.jpg)，失败则降级为 VL 识别
                    template_time_price = os.path.join(os.path.dirname(__file__), "Template", "timePrice.jpg")

                    # 记录点击前的页面特征，用于验证是否成功进入三级页面
                    before_click_md5 = Kit.get_image_content_md5(detail_path)

                    entered_price_page = False
                    click_pos = None

                    try:
                        # 1. 尝试模板匹配
                        if os.path.exists(template_time_price):
                            match_res = d.image.match(template_time_price)
                            if match_res:
                                # 兼容 uiautomator2 不同版本的返回结果 (Match对象或dict)
                                if hasattr(match_res, 'point') and match_res.point:
                                    click_pos = match_res.point
                                elif isinstance(match_res, dict):
                                    if 'point' in match_res and match_res['point']:
                                        click_pos = match_res['point']
                                    elif 'x' in match_res and 'y' in match_res:
                                        click_pos = (match_res['x'], match_res['y'])
                                elif isinstance(match_res, (list, tuple)) and len(match_res) >= 2:
                                    click_pos = (match_res[0], match_res[1])

                                if click_pos:
                                    logger.info(f"通过 timePrice.jpg 成功找到！坐标: {click_pos}")

                        # 如果模板匹配未成功解析坐标，或者匹配直接失败，则降级到 VL
                        if not click_pos:
                            logger.info("模板匹配未找到或解析失败，降级使用 VL 识别...")
                            res = await ReadImageKit.find_time_price_button_coordinate(detail_path, device_info=device_info)
                            if res.get("uia_center_x"):
                                click_pos = (res.get("uia_center_x"), res.get("uia_center_y"))
                                logger.info(f"VL 识别成功找到 '分时价格' 按钮: {click_pos}")

                        if click_pos:
                            d.click(click_pos[0], click_pos[1])
                            await asyncio.sleep(2)

                            # 检查页面是否真的变化了
                            after_click_path = take_screenshot(d, f"check_{detail_uuid}", save_dir=TEMP_IMAGE_DIR)
                            after_click_md5 = Kit.get_image_content_md5(after_click_path)

                            if before_click_md5 != after_click_md5:
                                entered_price_page = True
                                # 抓取三级页面图片
                                price_screenshots = []
                                price_screenshots.append(after_click_path)

                                # 滑动处理
                                last_price_md5 = after_click_md5
                                for scroll_idx in range(2):
                                    d.swipe(w // 2, int(h * 0.7), w // 2, int(h * 0.3), duration=0.4)
                                    await asyncio.sleep(1)
                                    scroll_path = take_screenshot(d, f"price_scroll_{scroll_idx}_{station_name}", save_dir=TEMP_IMAGE_DIR)
                                    curr_price_md5 = Kit.get_image_content_md5(
                                        scroll_path,
                                        top_ratio=SAFE_EXCLUDE_RATIO,
                                        bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO
                                    )
                                    if curr_price_md5 == last_price_md5: break
                                    price_screenshots.append(scroll_path)
                                    last_price_md5 = curr_price_md5

                                # 后台处理价格图片
                                logger.info(f"已启动后台分析价格页: {station_name}，共 {len(price_screenshots)} 张图")
                                task_price = asyncio.create_task(analyze_prices_background(service, station_name, price_screenshots))
                                background_tasks.append(task_price)

                                # 立即返回
                                d.press("back")
                                await asyncio.sleep(0.5)

                                # 检查返回后的页面状态
                                check_back_path = take_screenshot(d, f"check_back_{detail_uuid}", save_dir=TEMP_IMAGE_DIR)

                                check_list_md5 = Kit.get_image_content_md5(
                                    check_back_path,
                                    top_ratio=SAFE_EXCLUDE_RATIO,
                                    bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO
                                )
                                check_detail_md5 = Kit.get_image_content_md5(check_back_path)

                                if os.path.exists(check_back_path): os.remove(check_back_path)

                                if check_list_md5 == current_md5:
                                    logger.info("检测到已直接返回列表页，跳过后续返回操作。")
                                    should_back_to_list = False
                                elif check_detail_md5 == before_click_md5:
                                    logger.info("检测到返回详情页，需执行第二次返回。")

                    except Exception as e:
                        logger.error(f"处理三级页面详情异常 ({station_name}): {e}")

                    # 从二级页面返回列表页
                    if should_back_to_list:
                        d.press("back")
                        await asyncio.sleep(0.8)

                    # 记录 Redis
                    full_name_key = f"crawled:aite:{Kit.clean_station_name(station_name)}"
                    await redis_kit.set_data(full_name_key, "1", expire=REDIS_STATION_EXPIRE)

            # 如果内层循环已达上限，外层循环也应退出，避免不必要的滑动
            if total_encountered_count >= max_stations_count:
                break

        # 清理已完成的后台任务
        done_tasks = [t for t in background_tasks if t.done()]
        for t in done_tasks:
            if t.exception():
                logger.error(f"后台任务异常: {t.exception()}")
            background_tasks.remove(t)

        if stations and new_stations_processed_in_page == 0:
            no_new_data_count += 1
            logger.info(f"本页所有场站均已处理过，连续 {no_new_data_count} 页无新数据。")
            if no_new_data_count >= 5:
                logger.info("连续 5 页无新数据，判定为已到底，提前结束。")
                break
        else:
            no_new_data_count = 0

        # 5. 列表页翻页滑动
        logger.info("执行翻页滑动...")
        start_x, start_y = w // 2, int(h * 0.8)
        end_x, end_y = w // 2, int(h * (0.8 - SCROLL_DISTANCE_RATIO))
        d.swipe(start_x, start_y, end_x, end_y, duration=0.5)
        await asyncio.sleep(WAIT_AFTER_SCROLL)

    # 最终等待所有后台分析任务完成
    if background_tasks:
        logger.info(f"正在等待剩余 {len(background_tasks)} 个后台分析任务完成...")
        await asyncio.gather(*background_tasks, return_exceptions=True)

    logger.info(f"采集任务完成，共遇到 {total_encountered_count} 个场站，新处理 {total_new_processed_count} 个。")
    return total_new_processed_count

async def analyze_prices_background(service, station_name, image_paths):
    """
    后台异步处理价格图片分析
    """
    try:
        all_prices = []
        for path in image_paths:
            prices = await ReadImageKit.get_price_detail_from_image(path)
            if prices and isinstance(prices, list):
                for p in prices:
                    if not isinstance(p, dict): continue
                    if not any(isinstance(np_i, dict) and np_i.get("start") == p.get("start") for np_i in all_prices):
                        all_prices.append(p)

        if all_prices:
            all_prices = [p for p in all_prices if isinstance(p, dict)]
            all_prices.sort(key=lambda x: x.get("start", "00:00"))
            hourly_schedule = ReadImageKit.expand_schedule_to_24h(all_prices)
            await service.process_price_detail_data(station_name, hourly_schedule)
            logger.info(f"后台处理完成: {station_name} 的分时价格已保存。")

        # 处理完成后清理图片，释放磁盘空间
        for path in image_paths:
            try:
                if os.path.exists(path):
                    os.remove(path)
                    logger.debug(f"已删除已处理的图片: {path}")
            except Exception as e:
                logger.warning(f"删除图片失败: {path}, {e}")

    except Exception as e:
        logger.error(f"分析价格后台任务异常 ({station_name}): {e}")
    return True


async def main(service=None, do_cleanup=True):
    """
    爬虫主入口
    """
    should_close_service = False
    if service is None:
        service = AiTeJiYiChongService()
        should_close_service = True
        await service.init_db()

    redis_kit = RedisKit()
    d = u2.connect()

    # 任务日志记录
    task_id = str(uuid.uuid4())
    start_time = datetime.now()
    operator = "艾特吉易充"

    # 记录任务开始
    try:
        await service.db.save("t_crawl_task_log", {
            "task_id": task_id,
            "operator": operator,
            "start_time": start_time,
            "status": "running"
        }, "task_id")
    except Exception as e:
        logger.error(f"无法记录任务开始日志: {e}")

    total_count = 0
    status = "success"
    error_msg = None

    try:
        if do_cleanup:
            # 清理临时目录
            Kit.clear_temp_dir()

            # [Testing] 如果配置为测试模式，启动时清除 Redis 记录
            await clean_redis_data(redis_kit)

        total_count = await get_station_list(d, service, max_stations_count=MAX_STATIONS_COUNT)

        if total_count:
            logger.info("场站列表采集完成。")
        else:
            logger.warning("未采集到任何场站信息。")

        return True
    except Exception as e:
        logger.error(f"爬取过程中出现异常: {e}")
        status = "failed"
        error_msg = str(e)
        return False
    finally:
        # 记录任务结束
        end_time = datetime.now()
        duration = int((end_time - start_time).total_seconds())
        try:
            await service.db.update("t_crawl_task_log", {
                "task_id": task_id,
                "end_time": end_time,
                "duration_seconds": duration,
                "station_count": total_count,
                "status": status,
                "error_msg": error_msg
            }, "task_id")
            logger.info(f"任务执行日志已更新: {operator} | 耗时: {duration}s | 数量: {total_count} | 状态: {status}")
        except Exception as e:
            logger.error(f"无法更新任务执行日志: {e}")

        # 如果是内部初始化的 service，则在此关闭
        if should_close_service:
            await service.close_db()

if __name__ == "__main__":
    asyncio.run(main())