265 lines
11 KiB
Python
265 lines
11 KiB
Python
# coding=utf-8
|
||
import asyncio
|
||
import logging
|
||
import uuid
|
||
import os
|
||
import sys
|
||
import time
|
||
from PIL import Image
|
||
|
||
# 将项目根目录添加到 sys.path
|
||
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
if project_root not in sys.path:
|
||
sys.path.append(project_root)
|
||
|
||
import uiautomator2 as u2
|
||
from Apps.AiTeJiYiChong import Kit
|
||
from Apps.AiTeJiYiChong.Kit import take_screenshot
|
||
from Apps.AiTeJiYiChong.ReadImageKit import ReadImageKit
|
||
from Util.RedisKit import RedisKit
|
||
from Apps.AiTeJiYiChong.Service import AiTeJiYiChongService
|
||
from Config.Config import TEMP_IMAGE_DIR
|
||
from Apps.AiTeJiYiChong.Config.Setting import (
|
||
SCROLL_DISTANCE_RATIO,
|
||
MAX_SCROLLS, REDIS_STATION_EXPIRE,
|
||
WAIT_AFTER_SCROLL,
|
||
MAX_CRAWL_DISTANCE
|
||
)
|
||
|
||
logger = logging.getLogger("AiTeJiYiChongCrawler")
|
||
|
||
async def is_already_crawled(redis_kit, station_name):
|
||
"""
|
||
检查场站是否已经爬取过,仅按名称去重
|
||
"""
|
||
cleaned_name = Kit.clean_station_name(station_name)
|
||
if not cleaned_name:
|
||
return False
|
||
|
||
redis_key = f"crawled:aite:{cleaned_name}"
|
||
if await redis_kit.get_data(redis_key):
|
||
return True
|
||
return False
|
||
|
||
async def get_station_list(d, service, max_scrolls=MAX_SCROLLS):
|
||
"""
|
||
获取场站列表并处理翻页
|
||
"""
|
||
redis_kit = RedisKit()
|
||
window_size = d.window_size()
|
||
w, h = window_size[0], window_size[1]
|
||
|
||
device_info = d.info
|
||
device_info['width'] = w
|
||
device_info['height'] = h
|
||
|
||
logger.info(f"开始爬取列表,设备: {device_info.get('productName')} | 分辨率: {w}x{h}")
|
||
|
||
# 用于追踪后台分析任务
|
||
background_tasks = []
|
||
|
||
for i in range(max_scrolls + 1):
|
||
logger.info(f"正在处理第 {i + 1} 页...")
|
||
# ... (前序步骤保持不变)
|
||
|
||
# (在此处截断,以便后续替换循环体内的逻辑)
|
||
|
||
|
||
# 1. 拍摄截图
|
||
image_uuid = str(uuid.uuid4())
|
||
screenshot_path = take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR)
|
||
logger.info(f"列表页截图已完成: {screenshot_path}")
|
||
|
||
# 2. 执行图形学分析,生成 _flag.jpg, _vl.jpg 和 .json
|
||
logger.info("正在执行图形学切片分析...")
|
||
json_data = Kit.crop_cards_from_image(screenshot_path)
|
||
|
||
# 3. 调用混合模式识别 (图形学切片 + 本地 OCR)
|
||
stations = await service.process_station_list_hybrid(screenshot_path, device_info=device_info)
|
||
logger.info(f"本页识别到 {len(stations)} 个场站")
|
||
|
||
# 4. 匹配几何卡片与识别结果 (混合模式下已经包含在 stations 中,但为了兼容旧逻辑进行填充)
|
||
if json_data and json_data.get("cards") and stations:
|
||
for card in json_data["cards"]:
|
||
card_rect = card["rect"] # [x1, y1, x2, y2]
|
||
for st in stations:
|
||
st_bounds = st.get("bounds") # [x1, y1, x2, y2] (0-1000)
|
||
if not st_bounds: continue
|
||
|
||
# 转换 VL 坐标到像素坐标
|
||
st_y1_px = st_bounds[1] * h / 1000
|
||
st_y2_px = st_bounds[3] * h / 1000
|
||
|
||
# 计算 y 轴重叠
|
||
overlap = min(card_rect[3], st_y2_px) - max(card_rect[1], st_y1_px)
|
||
if overlap > 50: # 有显著重叠
|
||
card["station_name"] = st.get("station_name")
|
||
logger.info(f"匹配成功: 几何卡片 {card_rect} -> 场站 {card['station_name']}")
|
||
break
|
||
|
||
# 5. 遍历处理本页所有场站
|
||
if json_data and json_data.get("cards"):
|
||
for card_idx, card in enumerate(json_data["cards"]):
|
||
station_name = card.get("station_name")
|
||
if not station_name:
|
||
logger.warning(f"第 {card_idx + 1} 个几何卡片未匹配到场站名称,跳过。")
|
||
continue
|
||
|
||
# 检查 Redis 去重 (使用新的模糊匹配逻辑)
|
||
if await is_already_crawled(redis_kit, station_name):
|
||
logger.info(f"场站 {station_name} 匹配到已处理记录,跳过。")
|
||
continue
|
||
|
||
click_x, click_y = card["click_point"]
|
||
logger.info(f"准备处理第 {card_idx + 1} 个场站: {station_name}, 点击坐标: ({click_x}, {click_y})")
|
||
d.click(click_x, click_y)
|
||
|
||
# 等待二级页面加载
|
||
await asyncio.sleep(3)
|
||
|
||
# 截取二级页面图
|
||
detail_uuid = f"detail_{station_name}_{image_uuid}"
|
||
detail_path = take_screenshot(d, detail_uuid, save_dir=TEMP_IMAGE_DIR)
|
||
|
||
# 【异步优化】后台解析详情页基础数据,不阻塞 UI 流程
|
||
logger.info(f"已启动后台分析详情页: {station_name}")
|
||
|
||
async def detail_task_wrapper(p, name):
|
||
try:
|
||
await service.process_station_detail(p, station_name=name)
|
||
if os.path.exists(p): os.remove(p)
|
||
except Exception as ex:
|
||
logger.error(f"详情页分析异常 ({name}): {ex}")
|
||
|
||
task_detail = asyncio.create_task(detail_task_wrapper(detail_path, station_name))
|
||
background_tasks.append(task_detail)
|
||
|
||
# 6. 寻找 timePrice.jpg 图标并进入三级页面 (分时价格页)
|
||
time_price_template = os.path.join(os.path.dirname(__file__), "BiaoShi", "timePrice.jpg")
|
||
coords = Kit.find_template_coords(detail_path, time_price_template)
|
||
|
||
if coords:
|
||
cx, cy, conf = coords
|
||
logger.info(f"找到分时价格图标,进入三级页面...")
|
||
d.click(cx, cy)
|
||
await asyncio.sleep(2) # 等待加载
|
||
|
||
# 抓取三级页面图片(包括可能的滑动)
|
||
price_screenshots = []
|
||
price_detail_uuid = f"price_detail_{station_name}_{image_uuid}"
|
||
price_path = take_screenshot(d, price_detail_uuid, save_dir=TEMP_IMAGE_DIR)
|
||
price_screenshots.append(price_path)
|
||
|
||
# 如果需要滑动,先完成所有截图动作
|
||
last_md5 = Kit.get_file_md5(price_path)
|
||
for scroll_idx in range(2): # 减少滑动次数以加快速度,通常1-2次足够
|
||
d.swipe(w // 2, int(h * 0.7), w // 2, int(h * 0.3), duration=0.4)
|
||
await asyncio.sleep(1)
|
||
scroll_path = take_screenshot(d, f"price_scroll_{scroll_idx}_{station_name}", save_dir=TEMP_IMAGE_DIR)
|
||
curr_md5 = Kit.get_file_md5(scroll_path)
|
||
if curr_md5 == last_md5: break
|
||
price_screenshots.append(scroll_path)
|
||
last_md5 = curr_md5
|
||
|
||
# 【异步优化】后台处理所有价格图片,不阻塞 UI 返回
|
||
logger.info(f"已启动后台分析价格页: {station_name},共 {len(price_screenshots)} 张图")
|
||
task_price = asyncio.create_task(analyze_prices_background(service, station_name, price_screenshots))
|
||
background_tasks.append(task_price)
|
||
|
||
# 立即从三级页面返回
|
||
d.press("back")
|
||
await asyncio.sleep(0.5)
|
||
else:
|
||
logger.warning(f"场站 {station_name} 未找到分时价格入口。")
|
||
|
||
# 从二级页面返回列表页
|
||
d.press("back")
|
||
await asyncio.sleep(0.8)
|
||
|
||
# 记录 Redis (仅按名称去重,不再使用前缀索引)
|
||
full_name_key = f"crawled:aite:{Kit.clean_station_name(station_name)}"
|
||
await redis_kit.set_data(full_name_key, "1", expire=REDIS_STATION_EXPIRE)
|
||
|
||
# 在每一页结束时,清理已完成的任务,防止内存堆积
|
||
done_tasks = [t for t in background_tasks if t.done()]
|
||
for t in done_tasks:
|
||
if t.exception():
|
||
logger.error(f"后台任务执行异常: {t.exception()}")
|
||
background_tasks.remove(t)
|
||
|
||
# 7. 列表页翻页滑动
|
||
logger.info("执行翻页滑动...")
|
||
start_x, start_y = w // 2, int(h * 0.8)
|
||
end_x, end_y = w // 2, int(h * (0.8 - SCROLL_DISTANCE_RATIO))
|
||
d.swipe(start_x, start_y, end_x, end_y, duration=0.5)
|
||
await asyncio.sleep(WAIT_AFTER_SCROLL)
|
||
|
||
# 最终等待所有后台分析任务完成
|
||
if background_tasks:
|
||
logger.info(f"正在等待剩余 {len(background_tasks)} 个后台分析任务完成...")
|
||
await asyncio.gather(*background_tasks, return_exceptions=True)
|
||
|
||
logger.info("达到最大翻页次数,爬取结束。")
|
||
return True
|
||
|
||
async def analyze_prices_background(service, station_name, image_paths):
|
||
"""
|
||
后台异步处理价格图片分析
|
||
"""
|
||
try:
|
||
all_prices = []
|
||
for path in image_paths:
|
||
prices = await ReadImageKit.get_price_detail_from_image(path)
|
||
if prices and isinstance(prices, list):
|
||
for p in prices:
|
||
if not isinstance(p, dict): continue
|
||
if not any(isinstance(np_i, dict) and np_i.get("start") == p.get("start") for np_i in all_prices):
|
||
all_prices.append(p)
|
||
|
||
if all_prices:
|
||
all_prices = [p for p in all_prices if isinstance(p, dict)]
|
||
all_prices.sort(key=lambda x: x.get("start", "00:00"))
|
||
hourly_schedule = ReadImageKit.expand_schedule_to_24h(all_prices)
|
||
await service.process_price_detail_data(station_name, hourly_schedule)
|
||
logger.info(f"后台处理完成: {station_name} 的分时价格已保存。")
|
||
|
||
# 处理完成后清理图片,释放磁盘空间
|
||
for path in image_paths:
|
||
try:
|
||
if os.path.exists(path):
|
||
os.remove(path)
|
||
logger.debug(f"已删除已处理的图片: {path}")
|
||
except Exception as e:
|
||
logger.warning(f"删除图片失败: {path}, {e}")
|
||
|
||
except Exception as e:
|
||
logger.error(f"分析价格后台任务异常 ({station_name}): {e}")
|
||
return True
|
||
|
||
|
||
async def main(service=None, do_cleanup=True):
|
||
"""
|
||
爬虫主入口
|
||
"""
|
||
if do_cleanup:
|
||
Kit.clear_temp_dir()
|
||
|
||
if service is None:
|
||
service = AiTeJiYiChongService()
|
||
await service.init_db()
|
||
|
||
d = u2.connect()
|
||
|
||
try:
|
||
await get_station_list(d, service)
|
||
return True
|
||
except Exception as e:
|
||
logger.error(f"爬取过程中出现异常: {e}")
|
||
return False
|
||
finally:
|
||
# 如果是内部初始化的 service,则在此关闭
|
||
pass
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|