aiData/Apps/XinDianTu/Crawler.py

# coding=utf-8
import asyncio
import logging
import uuid
import os
import sys
import json
import time
import hashlib
from PIL import Image

# 将项目根目录添加到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)

import uiautomator2 as u2
from Core.BaseCrawler import BaseCrawler
from Util import Kit
from Util.Kit import take_screenshot
from Util.ObsUtil import ObsUploader
from Util.RedisKit import RedisKit
from Util.PaddleOCRKit import get_ocr_kit
import cv2
from Apps.XinDianTu.Service import XinDianTuService
from Config.Config import (
    OBS_TMP_PREFIX, CDN_DOMAIN, SCROLL_DISTANCE_RATIO,
    MAX_SCROLLS, REDIS_STATION_EXPIRE,
    WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, WAIT_AFTER_SCROLL,
    MAX_CRAWL_DISTANCE, TEMP_IMAGE_DIR
)

# --- 用户配置区域 ---
# 是否保留截图文件（True=保留备查，False=随用随删）
KEEP_SCREENSHOTS = True

# [Testing] 是否在启动时清除 Redis 中的场站处理记录
# True: 每次运行前清除记录，方便反复测试同一个场站
# False: 生产模式，保留记录以避免重复爬取
TEST_CLEAR_REDIS = True

# 配置说明：
# SCROLL_DISTANCE_RATIO 控制翻页时的滑动距离（在 Config.py 中修改）。
# 对于华为 Mate 20x 等分辨率较低或屏幕较小的手机，如果发现翻页时跳过了某些场站，
# 请尝试减小 SCROLL_DISTANCE_RATIO（例如设置为 0.4 或 0.3）。
# 这样每次滑动的距离变短，可以确保所有场站都能被完整显示并识别。

# 配置日志输出
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("StationList")

# 强制设置所有相关模块的日志级别为 INFO，防止被第三方库干扰
logging.getLogger("PaddleOCRKit").setLevel(logging.INFO)
logging.getLogger("Util.Kit").setLevel(logging.INFO)
logging.getLogger("OpenXinDianTu").setLevel(logging.INFO)
logging.getLogger("FullProcess").setLevel(logging.INFO)

class XinDianTuCrawler(BaseCrawler):
    """
    新电途小程序爬虫实现
    """
    def __init__(self, service=None):
        super().__init__(service)
        # 初始化配置参数
        self.max_scrolls = MAX_SCROLLS
        self.uploader = ObsUploader()
        self.redis_kit = RedisKit()

    async def start(self):
        """
        实现 BaseCrawler 的启动入口
        """
        # 兼容旧逻辑，直接调用 main
        await main(self.service)

    async def open_app(self):
        # 实际逻辑在 Opener.py，此处可作为封装层
        pass

    async def crawl_list(self):
        # 实际逻辑在 get_station_list，此处可作为封装层
        pass

    async def crawl_detail(self, station_info):
        pass

async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
    """
    获取场站列表并处理翻页
    """
    all_stations = []
    seen_names = set()
    redis_kit = RedisKit()

    # 确保在“找桩”或者首页列表界面
    window_size = d.window_size()
    w, h = window_size[0], window_size[1]

    # 获取设备信息以用于坐标计算
    device_info = d.info
    device_info['width'] = w
    device_info['height'] = h

    logger.info(f"设备信息: {device_info.get('productName')} | 窗口分辨率: {w}x{h}")

    max_distance_reached = False
    stop_reason = "max_scrolls_reached"  # 默认停止原因：达到最大翻页次数

    # 存储后台异步任务
    background_tasks = []

    for i in range(max_scrolls + 1):
        if max_distance_reached:
            logger.info("已达到最大抓取距离，停止翻页。")
            stop_reason = "max_distance_reached"
            break
        logger.info(f"正在处理第 {i + 1} 页...")

        # 1. 拍摄截图
        logger.info(f"Step [1/6] 正在拍摄列表页截图...")
        t_shot = time.time()
        image_uuid = str(uuid.uuid4())
        # 使用相对路径: 基于当前脚本目录下的 Images 文件夹
        base_dir = os.path.dirname(os.path.abspath(__file__))
        save_dir = TEMP_IMAGE_DIR
        screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir)
        logger.info(f"Step [1/6] 列表页截图已完成: {screenshot_path} (耗时: {time.time() - t_shot:.2f}s)")

        # 校验截图实际尺寸
        img_w, img_h = 0, 0
        try:
            with Image.open(screenshot_path) as img:
                img_w, img_h = img.size
                logger.info(f"截图实际尺寸: {img_w}x{img_h}")
                # 如果截图尺寸与窗口尺寸不符，更新 device_info 以适配坐标换算
                if img_w != w or img_h != h:
                    logger.warning(f"检测到截图尺寸 ({img_w}x{img_h}) 与窗口尺寸 ({w}x{h}) 不一致，将以截图尺寸为准进行坐标换算。")
                    device_info['width'] = img_w
                    device_info['height'] = img_h
        except Exception as e:
            logger.warning(f"无法获取截图尺寸信息: {e}")

        # 2. 使用 Kit.crop_cards_from_image 进行切片和生成 _vl.jpg
        logger.info("Step [2/6] 正在执行本地图形学切片分析，识别场站卡片并生成绿色方框图...")
        t_crop = time.time()
        # 这会生成 _vl.jpg 和 .json
        Kit.crop_cards_from_image(screenshot_path)
        logger.info(f"Step [2/6] 图形学切片分析完成 (耗时: {time.time() - t_crop:.2f}s)")

        # 读取生成的元数据
        json_path = screenshot_path.replace(".jpg", ".json")
        vl_img_path = screenshot_path.replace(".jpg", "_vl.jpg")

        if not os.path.exists(json_path) or not os.path.exists(vl_img_path):
            logger.error(f"❌ Step [2/6] 失败: 未生成有效的 JSON 或 VL 图片 (JSON: {os.path.exists(json_path)}, VL: {os.path.exists(vl_img_path)})")
            logger.warning("可能未识别到任何卡片，跳过当前页")
            continue

        logger.info(f"Step [3/6] 绿色方框图已生成: {vl_img_path}")

        with open(json_path, 'r', encoding='utf-8') as f:
            json_metadata = json.load(f)

        # --- New Logic: Local OCR via PaddleOCRKit ---
        logger.info("Step [4/6] 正在开始本地 OCR 识别流程，将识别出的卡片逐一交给 OCR...")
        t_ocr_total = time.time()

        stations = []
        try:
            ocr_kit = get_ocr_kit()

            # 读取原始截图进行裁剪 (比读取 _vl.jpg 更干净，没有绿框干扰)
            original_img = cv2.imread(screenshot_path)

            if "cards" in json_metadata and original_img is not None:
                h_img, w_img = original_img.shape[:2]
                num_cards = len(json_metadata['cards'])
                logger.info(f"检测到 {num_cards} 个场站卡片，准备开始逐一 OCR 识别...")

                for idx, card in enumerate(json_metadata["cards"]):
                    # card: {"id": 1, "rect": [x1, y1, x2, y2], "click_point": [cx, cy]}
                    rect = card.get("rect")
                    if not rect: continue
                    x1, y1, x2, y2 = rect

                    # 边界检查
                    x1 = max(0, min(x1, w_img))
                    x2 = max(0, min(x2, w_img))
                    y1 = max(0, min(y1, h_img))
                    y2 = max(0, min(y2, h_img))

                    if x2 <= x1 or y2 <= y1: continue

                    # 裁剪卡片
                    cropped_card = original_img[y1:y2, x1:x2]

                    # 识别
                    logger.info(f"正在识别第 {idx+1}/{num_cards} 个卡片 (区域: {rect})...")
                    t_card_start = time.time()
                    parsed_data = ocr_kit.recognize(cropped_card)
                    t_card_end = time.time()

                    # Log detailed OCR result
                    logger.info(f"卡片 {idx+1} OCR 识别耗时: {t_card_end - t_card_start:.2f}s")

                    if parsed_data and parsed_data.get("station_name"):
                         # 格式化数据以匹配原有结构
                         piles_list = parsed_data.get("piles", [])
                         piles_str_parts = []
                         for p in piles_list:
                             p_type = p.get("type", "")
                             p_free = p.get("free", 0)
                             p_total = p.get("total", 0)
                             piles_str_parts.append(f"{p_type}:{p_free}/{p_total}")

                         piles_str = " ".join(piles_str_parts)

                         station_info = {
                             "station_name": parsed_data.get("station_name"),
                             "price": str(parsed_data.get("price")) if parsed_data.get("price") is not None else "",
                             "pro_price": "", # 暂未解析
                             "piles": piles_str,
                             "distance": parsed_data.get("distance", ""),
                             "uia_center_x": card["click_point"][0],
                             "uia_center_y": card["click_point"][1],
                             "tags": parsed_data.get("tags", []),
                             "parking_info": parsed_data.get("parking", "")
                         }
                         logger.info(f"✅ 成功解析场站: {station_info['station_name']} | 距离: {station_info['distance']}")

                         # 立即记录场站基础状态信息 (电桩、价格等)
                         try:
                             await service.record_station_status(station_info)
                         except Exception as e:
                             logger.error(f"❌ 记录场站状态失败: {e}")

                         stations.append(station_info)
                    else:
                        logger.warning(f"⚠️ 卡片 {idx+1} 未能识别出有效场站名称")

        except Exception as e:
            logger.error(f"❌ Local OCR processing failed: {e}", exc_info=True)

        logger.info(f"Step [5/6] 本地 OCR 解析完成 (总耗时: {time.time() - t_ocr_total:.2f}s) | 成功识别: {len(stations)}/{len(json_metadata.get('cards', []))}")

        if not stations:
            logger.warning("当前页面未检测到任何有效的场站信息")
        else:
            logger.info(f"Step [6/6] 准备开始逐一点击场站进入详情页 (共 {len(stations)} 个)...")
            new_stations_found = False
            for s_idx, s in enumerate(stations):
                name = s.get("station_name")

                # 过滤掉明显的非场站名称
                if not name:
                    continue

                # --- 短期去重：检查 Redis 缓存 ---
                # 2分钟内如果处理过该场站，则跳过
                redis_key = f"processed_station:{name}"
                if await redis_kit.exists(redis_key):
                    logger.info(f"场站 {name} 在 2 分钟内已处理过，跳过")
                    continue

                if name in seen_names:
                    continue

                # 使用从 JSON 元数据中恢复的点击坐标
                x = s.get("uia_center_x")
                y = s.get("uia_center_y")

                if x is None or y is None:
                    logger.warning(f"场站 {name} 缺少坐标信息，无法进入详情页")
                    continue

                seen_names.add(name)
                all_stations.append(s)
                new_stations_found = True

                # 打印基本信息
                price = s.get("price")
                pro_price = s.get("pro_price")
                piles_str = s.get("piles", "")
                distance_str = s.get("distance", "")

                # 检查距离
                dist_log_part = ""
                if distance_str:
                    try:
                        import re
                        dist_match = re.search(r"(\d+(\.\d+)?)", distance_str)
                        if dist_match:
                            dist_val = float(dist_match.group(1))
                            is_km = "km" in distance_str.lower()
                            dist_km = dist_val if is_km else dist_val / 1000.0

                            # 将距离解析结果和限制检查合并到日志中
                            dist_log_part = f" (解析: {dist_km:.2f}km / 限: {MAX_CRAWL_DISTANCE}km)"

                            if dist_km >= MAX_CRAWL_DISTANCE:
                                logger.info(f"当前场站 {name} 距离 {dist_km}km (原始: {distance_str}) 已达到或超过限制 {MAX_CRAWL_DISTANCE}km。")
                                max_distance_reached = True
                    except Exception as e:
                        logger.warning(f"解析距离出错: {distance_str}, {e}")

                pro_info = f" | PRO会员价: {pro_price}" if pro_price else ""
                # 确保距离始终显示，如果为空则显示 "未知"
                display_distance = distance_str if distance_str else "未知"
                logger.info(f"正在处理第 {s_idx+1} 个场站: {name} | 价格: {price}{pro_info} | 枪: {piles_str} | 距离: {display_distance}{dist_log_part}")

                if max_distance_reached:
                    break

                # --- 点击进入详情页记录地址 ---
                # 使用相对坐标点击，以适应不同分辨率
                if img_w > 0 and img_h > 0:
                    rel_x = max(0.0, min(1.0, x / img_w))
                    rel_y = max(0.0, min(1.0, y / img_h))
                    logger.info(f"👉 正在点击进入详情页: {name} (相对坐标: {rel_x:.3f}, {rel_y:.3f})")
                    d.click(rel_x, rel_y)
                else:
                    logger.info(f"👉 正在点击进入详情页: {name} (绝对坐标: {x}, {y})")
                    d.click(x, y)

                await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD) # 等待详情页加载

                # 尝试获取并保存地址
                address_recorded = False
                for retry in range(2):
                    # 1. 拍摄详情页截图
                    t_d_shot = time.time()
                    detail_uuid = str(uuid.uuid4())
                    detail_screenshot_path = take_screenshot(d, detail_uuid, save_dir=save_dir)
                    logger.info(f"Step [详情页截图] 耗时: {time.time() - t_d_shot:.4f}s")

                    # 2. 上传至 OBS
                    t_d_up = time.time()
                    detail_object_key = f"{OBS_TMP_PREFIX}/{detail_uuid}.jpg"
                    up_success, _ = uploader.upload_file(detail_object_key, detail_screenshot_path)
                    logger.info(f"Step [上传详情页截图] 耗时: {time.time() - t_d_up:.4f}s")

                    if up_success:
                        detail_url = f"https://{CDN_DOMAIN}/{detail_object_key}"
                        # 3. 解析并保存地址 (异步后台处理)
                    # 定义异步任务函数
                    async def _process_address_task(task_name, task_url, task_dev_info, task_redis_key):
                        t_addr_task = time.time()
                        try:
                            addr_res = await service.process_station_address(task_name, task_url, device_info=task_dev_info)
                            logger.info(f"Step [解析并保存地址-后台] 耗时: {time.time() - t_addr_task:.4f}s")
                            if addr_res.get("address"):
                                logger.info(f"成功记录地址: {addr_res.get('address')}")
                                # 记录到 Redis，设置过期时间
                                await redis_kit.set_data(task_redis_key, "1", expire=REDIS_STATION_EXPIRE)
                        except Exception as e:
                            logger.error(f"后台处理地址失败 ({task_name}): {e}")

                    # 创建并启动任务
                    addr_task = asyncio.create_task(_process_address_task(name, detail_url, device_info, redis_key))
                    background_tasks.append(addr_task)
                    logger.info(f"已提交地址解析任务到后台: {name}")

                    # 标记为已记录以跳出 retry 循环
                    address_recorded = True

                    # 清理详情页临时截图
                    if not KEEP_SCREENSHOTS and os.path.exists(detail_screenshot_path):
                        os.remove(detail_screenshot_path)

                    break # 直接跳出 retry 循环

                # --- 抓取价格时段信息 (New Feature) ---
                # 仅在测试模式下或特定需求下启用
                try:
                    logger.info(f"开始抓取价格时段信息: {name}")

                    # 使用几何特征识别 "全部时段" 按钮
                    # 临时截图
                    temp_uuid = "temp_find_expand"
                    screenshot_path = take_screenshot(d, temp_uuid, save_dir=TEMP_IMAGE_DIR)

                    # 尝试识别，将调试图片保存到 Images 目录
                    t_find = time.time()
                    pos = Kit.find_expand_button_position(screenshot_path, debug_dir=save_dir, debug_filename_prefix=name)
                    logger.info(f"Step [识别展开按钮] 耗时: {time.time() - t_find:.4f}s")

                    # 清理截图
                    try:
                        os.remove(screenshot_path)
                    except:
                        pass

                    if pos:
                        x, y = pos
                        logger.info(f"通过几何特征找到 '全部时段' 按钮: ({x}, {y})")
                        d.click(x, y)
                        await asyncio.sleep(1.5)

                        # 抓取3屏截图
                        price_image_urls = []
                        for p_idx in range(1, 4):
                            # 使用场站名称的MD5值作为文件名前缀，避免中文和特殊字符导致的URL问题
                            t_p_loop = time.time()
                            name_md5 = hashlib.md5(name.encode('utf-8')).hexdigest()
                            p_uuid = f"{name_md5}_price_{p_idx}"
                            p_path = take_screenshot(d, p_uuid, save_dir=save_dir)
                            logger.info(f"已保存价格时段截图 {p_idx} (Station: {name}, MD5: {name_md5}): {p_path}")

                            # 上传到 OBS
                            p_object_key = f"{OBS_TMP_PREFIX}/{p_uuid}.jpg"
                            success, _ = uploader.upload_file(p_object_key, p_path)
                            if success:
                                p_url = f"https://{CDN_DOMAIN}/{p_object_key}"
                                price_image_urls.append(p_url)
                                logger.info(f"截图上传成功: {p_url}")
                            else:
                                logger.warning(f"截图上传失败: {p_path}")

                            logger.info(f"Step [价格截图与上传-{p_idx}] 耗时: {time.time() - t_p_loop:.4f}s")

                            if p_idx < 3:
                                d.swipe_ext("up", scale=0.7)
                                await asyncio.sleep(1.0)

                        # 调用服务解析价格时段 (异步后台处理)
                        if price_image_urls:
                            logger.info(f"正在调用接口解析电费时段数据 ({len(price_image_urls)} 张图片) - 转入后台...")

                            async def _process_price_task(task_name, task_urls, task_dev_info):
                                t_price_task = time.time()
                                try:
                                    await service.process_price_schedule(task_name, task_urls, device_info=task_dev_info)
                                    logger.info(f"Step [解析电费时段数据-后台] 耗时: {time.time() - t_price_task:.4f}s")
                                    logger.info(f"电费时段数据解析完成: {task_name}")
                                except Exception as e:
                                    logger.error(f"后台处理价格时段失败 ({task_name}): {e}")

                            price_task = asyncio.create_task(_process_price_task(name, price_image_urls, device_info))
                            background_tasks.append(price_task)
                        else:
                            logger.warning("未能获取到有效的价格时段截图，跳过解析")
                    else:
                        logger.warning(f"未能通过几何特征识别 '全部时段' 按钮")

                except Exception as e:
                    logger.warning(f"抓取价格时段信息时发生异常: {e}")

                # 测试模式：仅处理第一个场站
                # logger.info("测试模式：仅处理第一个场站，即将退出 (Y3)")
                # return all_stations

                # 返回列表页
                d.press("back")
                await asyncio.sleep(WAIT_BACK_TO_LIST) # 等待列表页重新稳定

                if not new_stations_found and i > 0:
                    logger.info("未发现更多新场站，停止下拉。")
                    # 如果需要强行爬取所有，可以注释掉 break
                    # break

        # 5. 如果还没到最后一页，执行下拉
        if i < max_scrolls and not max_distance_reached:
            logger.info(f"执行下拉翻页 (距离比例: {SCROLL_DISTANCE_RATIO})...")
            # 根据配置的比例计算滑动起始和终点
            # 保证滑动在屏幕中心区域进行
            start_y = 0.5 + (SCROLL_DISTANCE_RATIO / 2)
            end_y = 0.5 - (SCROLL_DISTANCE_RATIO / 2)
            d.swipe(w * 0.5, h * start_y, w * 0.5, h * end_y, duration=0.5)
            await asyncio.sleep(WAIT_AFTER_SCROLL) # 等待页面加载和列表稳定

        # 清理列表页截图
        if not KEEP_SCREENSHOTS:
            if os.path.exists(screenshot_path):
               os.remove(screenshot_path)
            if os.path.exists(vl_img_path):
               os.remove(vl_img_path)
            if os.path.exists(json_path):
               os.remove(json_path)

    logger.info(f"爬取任务结束，正在等待 {len(background_tasks)} 个后台处理任务完成...")
    if background_tasks:
        await asyncio.gather(*background_tasks)
    logger.info("所有后台任务已完成。")

    logger.info(f"任务结束，共采集到 {len(all_stations)} 个场站。停止原因: {stop_reason}")
    return all_stations

def clean_images_dir():
    """清理 Images 目录下除保留文件外的所有文件"""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    images_dir = os.path.join(base_dir, "Images")

    if not os.path.exists(images_dir):
        return

    keep_files = {'1.jpg', '2.jpg', '3.jpg', '4.jpg'}
    logger.info(f"正在清理 Images 目录 (保留: {keep_files})...")

    deleted_count = 0
    for filename in os.listdir(images_dir):
        if filename in keep_files:
            continue

        file_path = os.path.join(images_dir, filename)
        try:
            if os.path.isfile(file_path):
                os.remove(file_path)
                deleted_count += 1
        except Exception as e:
            logger.warning(f"删除文件失败 {filename}: {e}")

    logger.info(f"清理完成，共删除 {deleted_count} 个文件。")

async def clean_redis_data(redis_kit):
    """清除 Redis 中的场站处理记录"""
    if TEST_CLEAR_REDIS:
        logger.info("测试模式开启：正在清除 Redis 中的场站处理记录 (processed_station:*)...")
        keys = await redis_kit.keys("processed_station:*")
        if keys:
            count = await redis_kit.delete(*keys)
            logger.info(f"已清除 {count} 条场站处理记录")
        else:
            logger.info("未发现旧的场站处理记录")

async def main(service=None, do_cleanup=True):
    # 连接设备
    d = u2.connect()

    # 初始化服务
    should_close_service = False
    if service is None:
        service = XinDianTuService()
        should_close_service = True
        # 初始化数据库连接（XinDianTuService 需要）
        await service.init_db()

    uploader = ObsUploader()
    redis_kit = RedisKit()

    try:
        if do_cleanup:
            # 清理图片目录
            clean_images_dir()

            # [Testing] 如果配置为测试模式，启动时清除 Redis 记录
            await clean_redis_data(redis_kit)

        # 清理过期数据
        # await service.cleanup_old_data()

        # 获取场站列表
        stations = await get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS)

        if stations:
            logger.info("场站列表采集完成。")
        else:
            logger.warning("未采集到任何场站信息。")

        return True

    except Exception as e:
        logger.exception(f"运行过程中出现异常: {e}")
        return False
    finally:
        # 关闭数据库连接
        if should_close_service:
            await service.close_db()

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("程序被用户中断.")
    except Exception as e:
        logger.exception(f"程序崩溃: {e}")