# aiData/Apps/TeLaiDian/Crawler.py

# coding=utf-8
import asyncio
import os
import sys
import time
import json
import cv2
from Apps.TeLaiDian.Kit import (
    take_screenshot, get_image_content_md5, clean_station_name,
    setup_logger, read_image, save_image, detect_warm_popup_xczs_cv
)
from Apps.TeLaiDian.ReadImageKit import ReadImageKit
from Apps.TeLaiDian.Service import TeLaiDianService
from Apps.TeLaiDian.Config.Setting import (
    SCROLL_DISTANCE_RATIO, WAIT_AFTER_SCROLL, MAX_STATIONS_COUNT,
    SAFE_EXCLUDE_RATIO, BOTTOM_SAFE_EXCLUDE_RATIO, WAIT_DETAIL_PAGE_LOAD,
    WAIT_BACK_TO_LIST, DETAIL_SCROLL_DISTANCE_RATIO, FIRST_RUN_ONLY_ONE_STATION,
    REDIS_STATION_EXPIRE
)
from Core.BaseCrawler import BaseCrawler
from Util.RedisKit import RedisKit
import uiautomator2 as u2

# Project-root handling: the root is taken to be three directory levels above
# this file and is appended to sys.path when it is not already listed.
# NOTE(review): this runs after the `Apps.*` / `Core.*` / `Util.*` imports at
# the top of the file, so it cannot help those imports resolve — confirm
# whether it should move ahead of them or is only needed by later imports.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)

# Initialise the module-level logger used by every method below.
logger = setup_logger("TeLaiDianCrawler")

class TeLaiDianCrawler(BaseCrawler):
    def __init__(self, service=None):
        """Set up the crawler and the collaborators it works with.

        Args:
            service: Service object handed to ``BaseCrawler``. A new
                ``TeLaiDianService`` is created when a falsy value is given.
        """
        backing_service = service if service else TeLaiDianService()
        super().__init__(backing_service)

        # Helpers for screenshot analysis and for Redis access.
        self.read_image_kit = ReadImageKit()
        self.redis_kit = RedisKit()

        # Background price-analysis tasks that still have to be awaited.
        self.pending_price_tasks = []
        # Size of the semaphore that limits concurrent price-image analyses.
        self.vlm_concurrency = 3

    async def start(self):
        """Start-up entry point required by BaseCrawler.

        In full-pipeline scripts such as T4 the mini program has already been
        opened by the Opener, so this goes straight to the crawl logic.
        """
        await self.crawl_list_logic(u2.connect())

    async def open_app(self):
        """Open the TeLaiDian mini program and return whatever the opener returns."""
        # Imported at call time, exactly as the original code does.
        from Apps.TeLaiDian import Opener

        opened = await Opener.open_mini_program()
        return opened

    async def clear_ads(self, d, max_rounds=3):
        """Close ad popups on the current page, checking up to ``max_rounds`` times.

        Each round takes a screenshot, asks the VLM for a close button and taps
        it. The loop ends early on the first round that reports no ad. The
        round's screenshot is always deleted afterwards.

        Args:
            d: Device handle used for screenshots, window size and taps.
            max_rounds: Maximum number of detection rounds.
        """
        logger.info(f"--- [广告清理] 开始检测，最多尝试 {max_rounds} 轮 ---")
        for round_idx in range(1, max_rounds + 1):
            logger.info(f"[广告清理] 第 {round_idx} 轮：正在截屏分析...")
            ad_screen = take_screenshot(d, f"tld_ad_check_r{round_idx}_{int(time.time())}.jpg")

            try:
                res = await self.read_image_kit.find_close_button_vlm(ad_screen)

                # Guard clause: nothing to close means we are done.
                if not (res.get("has_ad") and res.get("close_point")):
                    logger.info(f"[广告清理] 第 {round_idx} 轮：未发现广告弹窗。VLM 理由: {res.get('reason', '无')}")
                    break

                close_point = res.get("close_point")
                reason = res.get("reason", "未提供原因")
                w, h = d.window_size()
                # close_point is scaled from a 0-1000 range to screen pixels.
                target_x = int(close_point[0] * w / 1000)
                target_y = int(close_point[1] * h / 1000)

                logger.info(f"[广告清理] 第 {round_idx} 轮：VLM 发现广告！原因: {reason}")
                logger.info(f"[广告清理] 计划点击坐标: ({target_x}, {target_y})，归一化坐标: {close_point}")

                # Safety check: never tap the WeChat mini-program capsule
                # button area in the top-right corner.
                in_capsule_zone = target_x > w * 0.75 and target_y < 150
                if in_capsule_zone:
                    logger.warning(f"[广告清理] ⚠️ 拒绝点击疑似微信胶囊按钮的区域: ({target_x}, {target_y})，跳过本轮。")
                    continue

                logger.info("[广告清理] 正在执行点击关闭操作...")
                d.click(target_x, target_y)
                # Give the popup time to vanish (or the next ad time to appear).
                await asyncio.sleep(2.0)
            except Exception as e:
                logger.error(f"[广告清理] 第 {round_idx} 轮检测发生异常: {e}")
            finally:
                # Runs on break / continue / error alike.
                if os.path.exists(ad_screen):
                    os.remove(ad_screen)

        logger.info("--- [广告清理] 任务结束 ---")

    async def crawl_list_logic(self, d):
        """Crawl the station list shown on the connected device.

        Flow: dismiss the "温馨提示" popup when both detectors confirm it, pull
        down once to refresh the list, then repeatedly screenshot the list,
        open every station that is not yet marked in Redis, crawl its detail
        page, and scroll on. The loop ends when the quota is reached, when two
        consecutive list screenshots hash to the same content, or after the
        first station when ``FIRST_RUN_ONLY_ONE_STATION`` is set. Pending
        background price tasks are awaited before returning.

        Args:
            d: uiautomator2 device handle (as returned by ``u2.connect()``).
        """
        # Ad clearing via clear_ads() is deliberately not run at start-up
        # (it was switched off on request).
        await self._dismiss_warm_popup(d)

        # Explicit pull-down gesture (30% -> 80% of the screen height) to
        # refresh the list / recalibrate the location.
        w, h = d.window_size()
        logger.info(f"执行显式下拉刷新操作: (x={w//2}, y1={int(h*0.3)} -> y2={int(h*0.8)})")
        d.swipe(w // 2, int(h * 0.3), w // 2, int(h * 0.8), duration=0.5)

        logger.info(f"等待 {WAIT_AFTER_SCROLL} 秒确保位置校准和列表刷新完成...")
        await asyncio.sleep(WAIT_AFTER_SCROLL)

        max_to_crawl = 1 if FIRST_RUN_ONLY_ONE_STATION else MAX_STATIONS_COUNT
        processed_count = 0
        last_md5 = None
        first_station_done = False

        while processed_count < max_to_crawl:
            screenshot_path = take_screenshot(d, f"tld_list_{int(time.time())}.jpg")
            # The try/finally guarantees the list screenshot is deleted on
            # every exit from this iteration (break, continue, exception).
            try:
                # Unchanged content between two screenshots means the list
                # cannot scroll any further.
                curr_md5 = get_image_content_md5(
                    screenshot_path,
                    top_ratio=SAFE_EXCLUDE_RATIO,
                    bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO,
                )
                if last_md5 == curr_md5:
                    logger.info("内容无变化，判定已到底部")
                    break
                last_md5 = curr_md5

                stations = await self.read_image_kit.analyze_station_list(screenshot_path)
                if not stations:
                    # No stations recognised: either we left the list page or
                    # this screen simply has none.
                    # NOTE(review): the re-entry path has no retry limit.
                    if await self.check_wrong_page(d, screenshot_path):
                        logger.error("检测到已退出详情列表页（可能回到了搜索页），尝试重新进入...")
                        await self.open_app()
                        await asyncio.sleep(5)
                        continue

                    logger.info("本页未检测到场站，尝试滑动...")
                    d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
                    await asyncio.sleep(WAIT_AFTER_SCROLL)
                    continue

                for station in stations:
                    if processed_count >= max_to_crawl:
                        break

                    name = station.get("name")
                    point = station.get("point")
                    if not name or not point:
                        continue

                    # Redis gives de-duplication across runs.
                    redis_key = f"crawled:tld:{clean_station_name(name)}"
                    if await self.redis_kit.get_data(redis_key):
                        logger.info(f"跳过已处理场站 (Redis): {name}")
                        continue

                    current_idx = processed_count + 1
                    remaining = max_to_crawl - current_idx
                    logger.info(f"--- [进度: {current_idx}/{max_to_crawl}, 剩余: {remaining}] 处理场站: {name} (坐标: {point}, 距离: {station.get('distance')}) ---")

                    if not await self._crawl_one_station(d, station, redis_key):
                        continue

                    processed_count += 1
                    if FIRST_RUN_ONLY_ONE_STATION:
                        logger.info("已完成首个场站的全流程采集，根据配置退出爬取任务。")
                        first_station_done = True
                        break

                if first_station_done:
                    # Leave without scrolling to the next page.
                    break

                # Scroll to the next page of the list.
                d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
                await asyncio.sleep(WAIT_AFTER_SCROLL)
            finally:
                self._discard_screenshot(screenshot_path)

        await self._drain_pending_price_tasks()

    async def _dismiss_warm_popup(self, d):
        """Tap "下次再说" on the warm-tip popup when both detectors confirm it."""
        popup_screen_path = take_screenshot(d, f"tld_detail_popup_{int(time.time())}.jpg")
        logger.info(f"[详情页] 截图用于检测温馨提示弹窗: {popup_screen_path}")
        # NOTE(review): this screenshot is never deleted here — confirm it is
        # meant to stay on disk for diagnostics.

        # First detector: locate the "下次再说" button. The result is used as
        # a box in 0-1000 coordinates and its centre is converted to pixels.
        ocr_res = detect_warm_popup_xczs_cv(popup_screen_path)
        ocr_point = None
        if ocr_res:
            w, h = d.window_size()
            ocr_point = (
                int((ocr_res[0] + ocr_res[2]) / 2 * w / 1000),
                int((ocr_res[1] + ocr_res[3]) / 2 * h / 1000)
            )
            logger.info(f"[详情页] OCR 检测到“下次再说”按钮位置: {ocr_point}")
        else:
            logger.info("[详情页] OCR 未能检测到“下次再说”按钮")

        # Second detector: the VLM. A failure counts as "no popup".
        vlm_popup = {"has_popup": False}
        try:
            vlm_popup = await self.read_image_kit.check_warm_popup_vlm(popup_screen_path)
        except Exception as e:
            logger.error(f"[详情页] VLM 检测温馨提示弹窗失败: {e}")

        has_vlm_popup = isinstance(vlm_popup, dict) and vlm_popup.get("has_popup")
        if not (ocr_point and has_vlm_popup):
            logger.info(f"[详情页] 温馨提示弹窗未通过双重确认，OCR检测: {bool(ocr_point)} | VLM 检测: {vlm_popup}")
            return

        click_x, click_y = ocr_point
        logger.info(f"[详情页] OCR 与 VLM 均确认存在温馨提示弹窗，即将点击“下次再说”按钮: ({click_x}, {click_y})")

        # Save an annotated copy showing where the tap will land.
        debug_popup_path = popup_screen_path.replace(".jpg", f"_xczs_click_{click_x}_{click_y}.jpg")
        try:
            img_popup = read_image(popup_screen_path)
            if img_popup is not None:
                cv2.circle(img_popup, (click_x, click_y), 20, (0, 0, 255), -1)
                save_image(debug_popup_path, img_popup)
                logger.info(f"[详情页] 已生成温馨提示弹窗点击诊断图片: {debug_popup_path}")
        except Exception as e:
            logger.error(f"[详情页] 生成温馨提示弹窗诊断图片失败: {e}")

        try:
            d.click(click_x, click_y)
            await asyncio.sleep(1.5)
        except Exception as e:
            logger.error(f"[详情页] 点击温馨提示“下次再说”失败: {e}")

    async def _crawl_one_station(self, d, station, redis_key):
        """Open one list entry, crawl its detail page and go back to the list.

        Returns:
            True when the detail page was crawled and marked in Redis, False
            when the tap landed on a wrong page and was backed out of.
        """
        name = station.get("name")
        point = station.get("point")

        d.click(point[0], point[1])
        logger.info(f"已点击场站 '{name}'，等待 {WAIT_DETAIL_PAGE_LOAD}s 加载详情页...")
        await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)

        # Screenshot used to verify that the detail page really opened.
        detail_check_path = take_screenshot(d, f"tld_detail_check_{int(time.time())}.jpg")
        logger.info(f"详情页快照已保存: {detail_check_path}")

        try:
            if await self.check_wrong_page(d, detail_check_path):
                logger.warning("检测到进入了错误页面，尝试返回列表...")
                # Tap the top-left back arrow, then also send a system back.
                d.click(40, 70)
                await asyncio.sleep(1.0)
                d.press("back")
                await asyncio.sleep(WAIT_BACK_TO_LIST)
                return False

            await self.crawl_detail_logic(d, station)
        finally:
            self._discard_screenshot(detail_check_path)

        # Mark the station as handled so later runs skip it.
        await self.redis_kit.set_data(redis_key, "1", expire=REDIS_STATION_EXPIRE)

        # NOTE(review): two back presses assume the price page was entered
        # (price page -> detail page -> list). crawl_detail_logic can finish
        # without entering it; confirm the second press is harmless then.
        d.press("back")
        await asyncio.sleep(WAIT_BACK_TO_LIST)
        d.press("back")
        await asyncio.sleep(WAIT_BACK_TO_LIST)
        return True

    async def _drain_pending_price_tasks(self):
        """Await every queued background price task, then empty the queue."""
        if not self.pending_price_tasks:
            return
        logger.info(f"[收尾] 等待后台价格任务完成，共 {len(self.pending_price_tasks)} 个...")
        try:
            await asyncio.gather(*self.pending_price_tasks, return_exceptions=True)
        finally:
            self.pending_price_tasks.clear()
        logger.info("[收尾] 后台价格任务已全部完成")

    @staticmethod
    def _discard_screenshot(path):
        """Delete a temporary screenshot; a failure is logged, not raised."""
        try:
            if os.path.exists(path):
                os.remove(path)
        except OSError as e:
            logger.warning(f"[清理] 删除临时截图失败 ({path}): {e}")

    async def check_wrong_page(self, d, image_path):
        """
        检查是否误触进入了错误的页面（如：我的卡券、活动页等）
        """
        data = await self.read_image_kit.check_wrong_page_vlm(image_path)
        is_detail = data.get("is_detail_page", True)
        if not is_detail:
            logger.warning(f"⚠️ 确认进入错误页面: {data.get('page_type')} ({data.get('reason')})")
        return not is_detail

    async def crawl_detail_logic(self, d, station_info):
        """Collect basic info and price screenshots from an open detail page.

        Steps:
          1. Screenshot the first screen and let the image kit read name,
             address, pile counts and parking info. Values from the list entry
             are the fallback, also when recognition fails.
          2. Scroll in small steps until OCR finds the price entrance, tap it.
          3. On the price page, scroll to the top, screenshot page by page and
             hand the screenshots to a background task that parses and saves
             the prices.
          4. If no background task was submitted, save the basic data directly.

        Args:
            d: Device handle used for screenshots, swipes and taps.
            station_info: Mapping for the list entry; ``name``, ``address``,
                ``distance``, ``total_piles`` and ``free_piles`` are read.
        """
        first_screen_path = take_screenshot(d, f"tld_detail_basic_{int(time.time())}.jpg")
        station_name = station_info.get("name")
        address = station_info.get("address")
        distance = station_info.get("distance")

        logger.info(f"[详情页] 进入 crawl_detail_logic，场站: {station_name} | 地址: {address}")
        logger.info(f"[详情页] 已截取首屏截图，准备识别基础信息: {first_screen_path}")

        basic_info = None
        recognised = False
        try:
            basic_info = await self.read_image_kit.analyze_detail_basic_info(first_screen_path)
            recognised = True
        except Exception as ex:
            logger.error(f"[详情页] 同步分析详情页基础信息失败: {ex}")
        finally:
            self._remove_temp_file(first_screen_path)

        if not isinstance(basic_info, dict):
            basic_info = {}

        # Recognised values win; empty ones keep what the list entry provided.
        station_name = basic_info.get("name") or station_name
        address = basic_info.get("address") or address
        total_piles = basic_info.get("total_piles")
        free_piles = basic_info.get("free_piles")
        piles_detail = basic_info.get("piles_detail")
        parking_info = basic_info.get("parking_info")

        # Applied outside the try so it also works when recognition raised.
        if total_piles is None:
            total_piles = station_info.get("total_piles")
        if free_piles is None:
            free_piles = station_info.get("free_piles")

        if recognised:
            logger.info(f"[详情页] 基础信息识别结果: {station_name} | {address} | 桩数: {total_piles}/{free_piles} | 停车费: {parking_info}")

        w, h = d.window_size()

        logger.info("[详情页] 根据用户要求：进入页面后多等会，然后通过 OCR 实时探测价格入口 (全部时段/全天价格统一)")
        # Extra wait so the page is fully loaded before OCR runs.
        logger.info(f"[详情页] 等待 {WAIT_DETAIL_PAGE_LOAD}s 确保页面稳定...")
        await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)

        max_scroll_attempts = 6   # upper bound on OCR probes
        scroll_step_ratio = 0.3   # each swipe covers 30% of the screen height

        entrance_clicked = False
        try:
            found_entry = await self._find_price_entrance(
                d, w, h, max_scroll_attempts, scroll_step_ratio
            )
            if found_entry:
                price_tab_screen = found_entry["screen"]
                p = found_entry["point"]

                # The OCR point is scaled from a 0-1000 range to pixels.
                entry_x = int(p[0] * w / 1000)
                entry_y = int(p[1] * h / 1000)

                # Only a warning: no coordinate adjustment is performed.
                if entry_y > h * 0.9:
                    logger.warning(f"[详情页] 入口坐标偏低 ({entry_y})，可能在底部遮罩，尝试微调。")

                # Annotated copy showing the tap position. Both it and the
                # matched screenshot are left on disk.
                debug_click_path = price_tab_screen.replace(".jpg", "_final_click.jpg")
                img = read_image(price_tab_screen)
                if img is not None:
                    cv2.circle(img, (entry_x, entry_y), 25, (0, 255, 0), -1)
                    save_image(debug_click_path, img)
                    logger.info(f"[详情页] 已生成最终点击诊断图: {debug_click_path}")

                logger.info(f"[详情页] 正在点击电价入口: ({entry_x}, {entry_y})")
                d.click(entry_x, entry_y)
                entrance_clicked = True
                await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)
            else:
                # Entrance text not found: keep a screenshot for diagnosis
                # and fall through to saving the basic data only.
                fail_screen = take_screenshot(d, f"tld_ocr_fail_{int(time.time())}.jpg")
                logger.error(f"❌ [OCR失败] 经过 {max_scroll_attempts} 次滚动仍未在页面中找到价格入口文字 (全部时段/全天价格统一)！")
                logger.error(f"❌ [OCR失败] 最终截图已保存至: {fail_screen}")
                logger.error("❌ [OCR失败] 将不进入电价页，继续以当前信息写入基础数据。")
        except Exception as e:
            logger.error(f"[详情页] 识别或点击价格入口失败: {e}")

        submitted_price_task = False
        if entrance_clicked:
            price_screenshots = await self._capture_price_pages(d)
            if price_screenshots:
                task = asyncio.create_task(
                    self._analyze_and_save_prices_async(
                        station_name=station_name,
                        address=address,
                        distance=distance,
                        price_screenshots=price_screenshots,
                        total_piles=total_piles,
                        free_piles=free_piles,
                        piles_detail=piles_detail,
                        parking_info=parking_info
                    )
                )
                # Kept so the list crawl can await it before finishing.
                self.pending_price_tasks.append(task)
                submitted_price_task = True
                logger.info(f"[详情页] 已后台提交 {len(price_screenshots)} 张电价截图进行识别与保存，继续后续流程不阻塞。")

        if not submitted_price_task:
            try:
                await self.service.save_station_profile_and_status(
                    station_name=station_name,
                    address=address,
                    total_piles=total_piles,
                    free_piles=free_piles,
                    piles_detail=piles_detail,
                    parking_info=parking_info,
                    distance=distance
                )
                logger.info(f"[详情页] 已基于整合信息写入基础数据: {station_name}")
            except Exception as e:
                logger.error(f"[详情页] 写入基础数据失败: {e}")

    async def _find_price_entrance(self, d, w, h, max_scroll_attempts, scroll_step_ratio):
        """Scroll the detail page in small steps until OCR finds the price entrance.

        Returns:
            ``{"screen": path, "point": point}`` for the first hit (that
            screenshot is kept for the click-diagnostic image), or None when
            no attempt found a usable entrance.
        """
        logger.info(f"[详情页] 开始“小步快跑”滚动查找价格入口 (全部时段/全天价格统一) (最多 {max_scroll_attempts} 次)...")

        for i in range(max_scroll_attempts):
            curr_screen = take_screenshot(d, f"tld_scroll_ocr_{i}_{int(time.time())}.jpg")
            keep_screen = False
            try:
                entry_data = await self.read_image_kit.find_price_entrance_ocr(curr_screen)

                # A hit needs a dict flagged "found" that carries a point.
                point = None
                if isinstance(entry_data, dict) and entry_data.get("found"):
                    point = entry_data.get("point")

                if point:
                    logger.info(f"[详情页] 第 {i+1} 次尝试：成功探测到价格入口！")
                    keep_screen = True
                    return {"screen": curr_screen, "point": point}

                # Not found: swipe up a little, except after the last probe.
                if i < max_scroll_attempts - 1:
                    logger.info(f"[详情页] 第 {i+1} 次尝试未找到，小幅向上滚动 (步长: {scroll_step_ratio*100}%)...")
                    d.swipe(w // 2, int(h * 0.7), w // 2, int(h * (0.7 - scroll_step_ratio)), duration=0.5)
                    await asyncio.sleep(1.5)
            finally:
                # Intermediate screenshots are removed even when a probe raises.
                if not keep_screen:
                    self._remove_temp_file(curr_screen)

        return None

    async def _capture_price_pages(self, d, max_scroll_up_to_top=10, max_scroll_down_pages=8):
        """Scroll the price list to its top, then screenshot it page by page.

        Returns:
            Paths of the captured page screenshots, in top-to-bottom order.
        """
        entered_price_path = take_screenshot(d, f"tld_detail_price_after_enter_{int(time.time())}.jpg")
        logger.info(f"[电价页] 入口点击后的电价页截图已保存: {entered_price_path}")

        await asyncio.sleep(1.0)

        # 1. Swipe down repeatedly until the screen stops changing, i.e. the
        #    top of the list (the 00:00 row) is showing.
        logger.info("正在向上滚动价格列表到顶部 (快速多次滚动以尽快看到 00:00)...")
        for i in range(max_scroll_up_to_top):
            before_scroll_path = take_screenshot(d, f"scroll_up_{i}.jpg")
            before_scroll_md5 = get_image_content_md5(before_scroll_path)

            d.swipe_ext("down", scale=0.85)
            await asyncio.sleep(0.5)

            after_scroll_path = take_screenshot(d, f"scroll_up_after_{i}.jpg")
            after_scroll_md5 = get_image_content_md5(after_scroll_path)

            self._remove_temp_file(before_scroll_path)
            self._remove_temp_file(after_scroll_path)

            if before_scroll_md5 == after_scroll_md5:
                logger.info(f"价格列表已到达顶部 (滚动次数: {i})")
                break

        # 2. Capture each page, then swipe up. An unchanged screen after the
        #    swipe means the bottom was reached.
        logger.info("正在从顶部开始向下逐页截图...")
        price_screenshots = []
        for p_idx in range(1, max_scroll_down_pages + 1):
            p_shot = take_screenshot(d, f"tld_detail_price_{int(time.time())}_{p_idx}.jpg")
            before_dn_md5 = get_image_content_md5(p_shot)

            d.swipe_ext("up", scale=0.8)
            await asyncio.sleep(1.2)

            check_dn_path = take_screenshot(d, f"check_dn_{p_idx}.jpg")
            after_dn_md5 = get_image_content_md5(check_dn_path)
            self._remove_temp_file(check_dn_path)

            price_screenshots.append(p_shot)

            if before_dn_md5 == after_dn_md5:
                logger.info(f"价格列表已到达底部 (共抓取页数: {p_idx})")
                break

        return price_screenshots

    @staticmethod
    def _remove_temp_file(path):
        """Delete a temporary file; a failure is logged, not raised."""
        try:
            if os.path.exists(path):
                os.remove(path)
        except OSError as e:
            logger.warning(f"[清理] 删除临时文件失败 ({path}): {e}")

    async def crawl_list(self):
        """Implement the BaseCrawler abstract method: connect and crawl the list."""
        await self.crawl_list_logic(u2.connect())

    async def crawl_detail(self, station_info):
        """
        实现 BaseCrawler 的抽象方法
        """
        # 逻辑已在 crawl_list_logic 中通过 crawl_detail_logic 调用
        pass

    async def _analyze_and_save_prices_async(self, station_name, address, distance, price_screenshots, total_piles=None, free_piles=None, piles_detail=None, parking_info=None):
        all_prices = []
        sem = asyncio.Semaphore(self.vlm_concurrency)

        async def analyze_one(path):
            try:
                async with sem:
                    prices = await self.read_image_kit.analyze_detail_price_info(path)
                return path, prices
            except Exception as e:
                logger.error(f"[详情页] 异步识别价格失败 ({os.path.basename(path)}): {e}")
                return path, []

        try:
            tasks = [analyze_one(p) for p in price_screenshots]
            results = await asyncio.gather(*tasks, return_exceptions=False)

            for path, prices in results:
                if prices:
                    all_prices.extend(prices)
                if os.path.exists(path):
                    try: os.remove(path)
                    except: pass

            if not all_prices:
                logger.warning(f"[详情页] {station_name} 后台识别未提取到任何价格信息")
                return

            unique_prices = []
            seen_periods = set()
            for p in all_prices:
                key = f"{p.get('start')}-{p.get('end')}"
                if key not in seen_periods:
                    unique_prices.append(p)
                    seen_periods.add(key)

            unique_prices.sort(key=lambda x: x.get("start", "00:00"))

            await self.service.save_station_data(
                station_name=station_name,
                address=address,
                prices=unique_prices,
                total_piles=total_piles,
                free_piles=free_piles,
                piles_detail=piles_detail,
                parking_info=parking_info,
                distance=distance,
            )
            logger.info(f"[详情页] {station_name} 后台价格信息处理完成，共 {len(unique_prices)} 条时段，并已写入数据库。")
        except Exception as e:
            logger.error(f"[详情页] 后台处理价格截图失败: {e}")
        finally:
            for p_shot in price_screenshots:
                if os.path.exists(p_shot):
                    try: os.remove(p_shot)
                    except: pass

async def main(service=None):
    """Build a TeLaiDianCrawler (optionally with the given service) and start it."""
    await TeLaiDianCrawler(service=service).start()

# Script entry point: run the crawler with its default service.
if __name__ == "__main__":
    asyncio.run(main())