Files
aiData/Apps/YeLiTe/Crawler.py
HuangHai 78f116ab84 'commit'
2026-01-16 19:30:31 +08:00

378 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import logging
import uuid
import os
import sys
import json
import time
from datetime import datetime
from Apps.YeLiTe.Kit import take_screenshot, clean_station_name, get_image_content_md5, detect_price_info_container_cv
from Apps.YeLiTe.ReadImageKit import ReadImageKit
from Apps.YeLiTe.Service import YiLaiTeService
from Apps.YeLiTe.Config.Setting import (
SCROLL_DISTANCE_RATIO, WAIT_AFTER_SCROLL, MAX_STATIONS_COUNT,
WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, TEST_CLEAR_REDIS,
SAFE_EXCLUDE_RATIO, BOTTOM_SAFE_EXCLUDE_RATIO,
FIRST_RUN_ONLY_ONE_STATION
)
from Util.RedisKit import RedisKit
from Core.BaseCrawler import BaseCrawler
import uiautomator2 as u2
# 项目根目录处理
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
sys.path.append(project_root)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("YiLaiTeCrawler")
class YiLaiTeCrawler(BaseCrawler):
def __init__(self, service=None):
super().__init__(service or YiLaiTeService())
self.read_image_kit = ReadImageKit()
self.redis_kit = RedisKit()
async def start(self):
"""
实现 BaseCrawler 的启动入口
"""
await main(self.service)
async def open_app(self):
"""
打开驿来特小程序
"""
from Apps.YeLiTe import Opener
return await Opener.open_mini_program()
async def crawl_list(self):
"""
开始爬取列表页
"""
d = u2.connect()
return await self.crawl_list_logic(d)
async def crawl_detail(self, station_info):
"""
爬取详情页 (BaseCrawler 要求,此处逻辑已集成在 crawl_list_logic 中)
"""
pass
async def clean_redis_data(self):
"""
清除测试用的 Redis 记录
"""
if TEST_CLEAR_REDIS:
logger.info("清理 Redis 中的场站处理记录...")
pattern = "crawled:ylt:*"
keys = await self.redis_kit.keys(pattern)
if keys:
await self.redis_kit.delete(*keys)
async def crawl_list_logic(self, d):
w, h = d.window_size()
max_to_crawl = 1 if FIRST_RUN_ONLY_ONE_STATION else MAX_STATIONS_COUNT
processed_count = 0
no_new_data_count = 0
last_md5 = None
background_tasks = []
while processed_count < max_to_crawl:
# 1. 截图并分析
screenshot_path = take_screenshot(d, f"list_{int(time.time())}.jpg")
# 检测是否滚动到底部
curr_md5 = get_image_content_md5(screenshot_path, top_ratio=SAFE_EXCLUDE_RATIO, bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO)
if last_md5 == curr_md5:
logger.info("内容无变化,判定已到底部")
if os.path.exists(screenshot_path): os.remove(screenshot_path)
break
last_md5 = curr_md5
stations = await self.read_image_kit.analyze_station_list(screenshot_path)
if not stations:
no_new_data_count += 1
if no_new_data_count >= 5:
logger.info("连续 5 页无新数据,停止")
break
# 滑动
d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
await asyncio.sleep(WAIT_AFTER_SCROLL)
continue
no_new_data_count = 0
new_stations_in_page = 0
for station in stations:
if processed_count >= max_to_crawl:
break
name = station.get('name')
address = station.get('address')
distance = station.get('distance')
point = station.get('point')
if not name or not point:
continue
# 去重
redis_key = f"crawled:ylt:{clean_station_name(name)}"
if await self.redis_kit.get_data(redis_key):
logger.info(f"场站 {name} 已处理过,跳过")
continue
current_idx = processed_count + 1
remaining = max_to_crawl - current_idx
logger.info(f"--- [进度: {current_idx}/{max_to_crawl}, 剩余: {remaining}] 发现新场站: {name} (坐标: {point}, 距离: {distance}) ---")
# 点击进入前截图,用于对比是否成功进入二级页
before_click_path = take_screenshot(d, f"before_{clean_station_name(name)}")
before_md5 = get_image_content_md5(before_click_path, top_ratio=SAFE_EXCLUDE_RATIO, bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO)
# 稍微等待一下,确保 UI 稳定
await asyncio.sleep(0.5)
# 使用 input tap 替代 d.click提高点击成功率 (部分小程序对 click 响应不佳)
d.shell(f"input tap {int(point[0])} {int(point[1])}")
await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)
# 分析详情页 (采用异步后台模式)
detail_shot = take_screenshot(d, f"detail_{clean_station_name(name)}_{int(time.time())}")
after_md5 = get_image_content_md5(detail_shot, top_ratio=SAFE_EXCLUDE_RATIO, bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO)
# 清理临时对比图
if os.path.exists(before_click_path): os.remove(before_click_path)
if before_md5 == after_md5:
logger.warning(f"首次点击 {name} 未跳转,尝试稍微偏移位置重试...")
# 尝试向下偏移 20px 点击
offset_y = point[1] + 20
d.shell(f"input tap {int(point[0])} {int(offset_y)}")
await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)
# 再次截图检查
if os.path.exists(detail_shot): os.remove(detail_shot)
detail_shot = take_screenshot(d, f"detail_{clean_station_name(name)}_{int(time.time())}")
after_md5 = get_image_content_md5(detail_shot, top_ratio=SAFE_EXCLUDE_RATIO, bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO)
if before_md5 != after_md5:
logger.info(f"成功进入详情页: {name}")
# 兜底:在进入详情页后,先基于列表页信息一次性写入 Profile 与 Status
try:
await self.service.save_station_profile_and_status(
station_name=name,
address=address,
total_piles=station.get("total_piles"),
free_piles=station.get("free_piles"),
piles_detail=None,
parking_info=None,
distance=distance
)
except Exception as e:
logger.warning(f"兜底写入场站基础信息失败: {name}, {e}")
# --- 新增:点击“阶段性电价”按钮以获取完整电价列表 ---
# 使用 OCR 探测价格入口
dqdf_pos = detect_price_info_container_cv(detail_shot)
detail_shots = []
if dqdf_pos:
logger.info(f"发现价格入口按钮 (阶段性电价/当前电费) {dqdf_pos},点击进入...")
d.click(dqdf_pos[0], dqdf_pos[1])
await asyncio.sleep(2) # 等待列表加载
# 删除旧的详情页截图
if os.path.exists(detail_shot): os.remove(detail_shot)
# 1. 向下滚动到底 (根据用户反馈只有不断向下滚动才能看到00点的)
logger.info("正在向下滚动价格列表到底部 (快速多次滚动以尽快看到 00:00)...")
max_scroll_down = 10
for i in range(max_scroll_down):
before_scroll_path = take_screenshot(d, f"scroll_dn_{i}")
before_scroll_md5 = get_image_content_md5(before_scroll_path)
d.swipe_ext("up", scale=0.8)
await asyncio.sleep(0.5)
after_scroll_path = take_screenshot(d, f"scroll_dn_after_{i}")
after_scroll_md5 = get_image_content_md5(after_scroll_path)
# 清理临时截图
if os.path.exists(before_scroll_path): os.remove(before_scroll_path)
if os.path.exists(after_scroll_path): os.remove(after_scroll_path)
if before_scroll_md5 == after_scroll_md5:
logger.info(f"价格列表已到达底部 (滚动次数: {i})")
break
# 2. 向上滚动并逐页截图 (从底向上抓取)
logger.info("正在向上滚动价格列表并逐页截图...")
max_scroll_up = 10
for p_idx in range(1, max_scroll_up + 1):
# 截图当前页
p_shot = take_screenshot(d, f"detail_price_{clean_station_name(name)}_{int(time.time())}_{p_idx}")
# 检查是否还能向上滚动
before_up_md5 = get_image_content_md5(p_shot)
d.swipe_ext("down", scale=0.85)
await asyncio.sleep(0.5)
# 检查是否还有新内容
check_up_shot = take_screenshot(d, f"check_up_{p_idx}")
after_up_md5 = get_image_content_md5(check_up_shot)
if os.path.exists(check_up_shot): os.remove(check_up_shot)
detail_shots.append(p_shot)
if before_up_md5 == after_up_md5:
logger.info(f"价格列表已到达顶部 (共抓取页数: {p_idx})")
break
# 关闭分时段定价列表 (点击屏幕最顶部空白处)
logger.info("点击屏幕上部空白处以关闭定价列表...")
d.click(w * 0.5, h * 0.1)
await asyncio.sleep(1.0)
else:
logger.info("未发现价格入口按钮 (阶段性电价/当前电费),直接分析当前页")
detail_shots.append(detail_shot)
# --------------------------------------------------
# 启动后台任务处理详情页
task = asyncio.create_task(self.analyze_detail_background(name, detail_shots, address=address, distance=distance))
background_tasks.append(task)
processed_count += 1
new_stations_in_page += 1
await self.redis_kit.set_data(redis_key, "1", expire=86400*7)
# 返回列表页 (现在应该已经回到了详情页主界面)
d.press("back")
await asyncio.sleep(WAIT_BACK_TO_LIST)
# 再次检查是否回到了列表页,如果没回,再点一次 back
after_back_shot = take_screenshot(d, "check_back")
after_back_md5 = get_image_content_md5(after_back_shot, top_ratio=SAFE_EXCLUDE_RATIO, bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO)
if os.path.exists(after_back_shot): os.remove(after_back_shot)
if after_back_md5 == after_md5: # 还在详情页/电费页
logger.info("似乎还在二级页面,尝试再次返回...")
d.press("back")
await asyncio.sleep(WAIT_BACK_TO_LIST)
if FIRST_RUN_ONLY_ONE_STATION:
logger.info("已完成首个场站的全流程采集,根据配置退出驿来特爬取任务。")
return processed_count
else:
logger.warning(f"点击场站 {name} 后页面似乎未跳转,跳过返回操作")
if os.path.exists(detail_shot): os.remove(detail_shot)
# 即使没进去,也记录一下,避免短时间内重复尝试
await self.redis_kit.set_data(redis_key, "1", expire=3600)
if new_stations_in_page == 0 and stations:
no_new_data_count += 1
if no_new_data_count >= 5:
logger.info("连续 5 页均无新场站,停止")
break
else:
no_new_data_count = 0
# 滑动翻页
d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
await asyncio.sleep(WAIT_AFTER_SCROLL)
# 等待后台任务完成
if background_tasks:
logger.info(f"等待 {len(background_tasks)} 个后台分析任务完成...")
await asyncio.gather(*background_tasks, return_exceptions=True)
return processed_count
async def analyze_detail_background(self, station_name, image_paths, address=None, distance=None):
"""
后台异步分析详情页 (支持多张截图合并)
"""
try:
if isinstance(image_paths, str):
image_paths = [image_paths]
logger.info(f"开始后台分析场站: {station_name} (图片数: {len(image_paths)})")
all_prices = []
for img_path in image_paths:
prices = await self.read_image_kit.analyze_detail_price(img_path)
if prices:
all_prices.extend(prices)
# 去重合并
unique_prices = []
seen_periods = set()
for p in all_prices:
start = str(p.get('start', '')).strip()
end = str(p.get('end', '')).strip()
if not start or not end:
continue
key = f"{start}-{end}"
if key not in seen_periods:
seen_periods.add(key)
unique_prices.append(p)
# 按开始时间排序
unique_prices.sort(key=lambda x: x.get('start', '00:00'))
if unique_prices:
await self.service.process_price_detail_data(station_name, unique_prices, address=address, distance=distance)
logger.info(f"场站 {station_name} 价格分析完成并入库 (记录数: {len(unique_prices)}, 地址: {address}, 距离: {distance})")
else:
logger.warning(f"场站 {station_name} 未识别到价格信息")
except Exception as e:
logger.error(f"后台分析 {station_name} 失败: {e}")
finally:
# 调试阶段暂时不删除截图,方便排查 VLM 识别失败原因
pass
# if isinstance(image_paths, list):
# for p in image_paths:
# if os.path.exists(p): os.remove(p)
# elif isinstance(image_paths, str) and os.path.exists(image_paths):
# os.remove(image_paths)
async def main(service=None):
if service is None:
service = YiLaiTeService()
await service.init_db()
crawler = YiLaiTeCrawler(service)
d = u2.connect()
# 清理 Redis
await crawler.clean_redis_data()
# 记录任务日志
task_id = str(uuid.uuid4())
await service.log_task_start(task_id)
total_count = 0
status = "success"
error_msg = None
try:
total_count = await crawler.crawl_list_logic(d)
except Exception as e:
logger.error(f"主流程执行失败: {e}")
status = "failed"
error_msg = str(e)
finally:
await service.log_task_end(task_id, total_count, status, error_msg)
# 如果是内部初始化的 service则关闭
if service and not isinstance(service, YiLaiTeService):
await service.close_db()
async def get_image_md5_async(path):
return get_image_md5(path)
if __name__ == "__main__":
asyncio.run(main())