# coding=utf-8 import asyncio import logging import uuid import os import sys import json import time import hashlib from PIL import Image # 将项目根目录添加到 sys.path project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) if project_root not in sys.path: sys.path.append(project_root) import uiautomator2 as u2 from Core.BaseCrawler import BaseCrawler from Util import Kit from Util.Kit import take_screenshot from Util.ObsUtil import ObsUploader from Util.RedisKit import RedisKit from Util.PaddleOCRKit import get_ocr_kit import cv2 from Apps.XinDianTu.Service import XinDianTuService from Config.Config import ( OBS_TMP_PREFIX, CDN_DOMAIN, SCROLL_DISTANCE_RATIO, MAX_SCROLLS, REDIS_STATION_EXPIRE, WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, WAIT_AFTER_SCROLL, MAX_CRAWL_DISTANCE, TEMP_IMAGE_DIR ) # --- 用户配置区域 --- # 是否保留截图文件(True=保留备查,False=随用随删) KEEP_SCREENSHOTS = True # [Testing] 是否在启动时清除 Redis 中的场站处理记录 # True: 每次运行前清除记录,方便反复测试同一个场站 # False: 生产模式,保留记录以避免重复爬取 TEST_CLEAR_REDIS = True # 配置说明: # SCROLL_DISTANCE_RATIO 控制翻页时的滑动距离(在 Config.py 中修改)。 # 对于华为 Mate 20x 等分辨率较低或屏幕较小的手机,如果发现翻页时跳过了某些场站, # 请尝试减小 SCROLL_DISTANCE_RATIO(例如设置为 0.4 或 0.3)。 # 这样每次滑动的距离变短,可以确保所有场站都能被完整显示并识别。 # 配置日志输出 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout) ] ) logger = logging.getLogger("StationList") # 强制设置所有相关模块的日志级别为 INFO,防止被第三方库干扰 logging.getLogger("PaddleOCRKit").setLevel(logging.INFO) logging.getLogger("Util.Kit").setLevel(logging.INFO) logging.getLogger("OpenXinDianTu").setLevel(logging.INFO) logging.getLogger("FullProcess").setLevel(logging.INFO) class XinDianTuCrawler(BaseCrawler): """ 新电途小程序爬虫实现 """ def __init__(self, service=None): super().__init__(service) # 初始化配置参数 self.max_scrolls = MAX_SCROLLS self.uploader = ObsUploader() self.redis_kit = RedisKit() async def start(self): """ 实现 BaseCrawler 的启动入口 """ # 兼容旧逻辑,直接调用 main await main(self.service) async def open_app(self): # 实际逻辑在 Opener.py,此处可作为封装层 pass async def crawl_list(self): # 实际逻辑在 get_station_list,此处可作为封装层 pass async def crawl_detail(self, station_info): pass async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS): """ 获取场站列表并处理翻页 """ all_stations = [] seen_names = set() redis_kit = RedisKit() # 确保在“找桩”或者首页列表界面 window_size = d.window_size() w, h = window_size[0], window_size[1] # 获取设备信息以用于坐标计算 device_info = d.info device_info['width'] = w device_info['height'] = h logger.info(f"设备信息: {device_info.get('productName')} | 窗口分辨率: {w}x{h}") max_distance_reached = False stop_reason = "max_scrolls_reached" # 默认停止原因:达到最大翻页次数 # 存储后台异步任务 background_tasks = [] for i in range(max_scrolls + 1): if max_distance_reached: logger.info("已达到最大抓取距离,停止翻页。") stop_reason = "max_distance_reached" break logger.info(f"正在处理第 {i + 1} 页...") # 1. 拍摄截图 logger.info(f"Step [1/6] 正在拍摄列表页截图...") t_shot = time.time() image_uuid = str(uuid.uuid4()) # 使用相对路径: 基于当前脚本目录下的 Images 文件夹 base_dir = os.path.dirname(os.path.abspath(__file__)) save_dir = TEMP_IMAGE_DIR screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir) logger.info(f"Step [1/6] 列表页截图已完成: {screenshot_path} (耗时: {time.time() - t_shot:.2f}s)") # 校验截图实际尺寸 img_w, img_h = 0, 0 try: with Image.open(screenshot_path) as img: img_w, img_h = img.size logger.info(f"截图实际尺寸: {img_w}x{img_h}") # 如果截图尺寸与窗口尺寸不符,更新 device_info 以适配坐标换算 if img_w != w or img_h != h: logger.warning(f"检测到截图尺寸 ({img_w}x{img_h}) 与窗口尺寸 ({w}x{h}) 不一致,将以截图尺寸为准进行坐标换算。") device_info['width'] = img_w device_info['height'] = img_h except Exception as e: logger.warning(f"无法获取截图尺寸信息: {e}") # 2. 使用 Kit.crop_cards_from_image 进行切片和生成 _vl.jpg logger.info("Step [2/6] 正在执行本地图形学切片分析,识别场站卡片并生成绿色方框图...") t_crop = time.time() # 这会生成 _vl.jpg 和 .json Kit.crop_cards_from_image(screenshot_path) logger.info(f"Step [2/6] 图形学切片分析完成 (耗时: {time.time() - t_crop:.2f}s)") # 读取生成的元数据 json_path = screenshot_path.replace(".jpg", ".json") vl_img_path = screenshot_path.replace(".jpg", "_vl.jpg") if not os.path.exists(json_path) or not os.path.exists(vl_img_path): logger.error(f"❌ Step [2/6] 失败: 未生成有效的 JSON 或 VL 图片 (JSON: {os.path.exists(json_path)}, VL: {os.path.exists(vl_img_path)})") logger.warning("可能未识别到任何卡片,跳过当前页") continue logger.info(f"Step [3/6] 绿色方框图已生成: {vl_img_path}") with open(json_path, 'r', encoding='utf-8') as f: json_metadata = json.load(f) # --- New Logic: Local OCR via PaddleOCRKit --- logger.info("Step [4/6] 正在开始本地 OCR 识别流程,将识别出的卡片逐一交给 OCR...") t_ocr_total = time.time() stations = [] try: ocr_kit = get_ocr_kit() # 读取原始截图进行裁剪 (比读取 _vl.jpg 更干净,没有绿框干扰) original_img = cv2.imread(screenshot_path) if "cards" in json_metadata and original_img is not None: h_img, w_img = original_img.shape[:2] num_cards = len(json_metadata['cards']) logger.info(f"检测到 {num_cards} 个场站卡片,准备开始逐一 OCR 识别...") for idx, card in enumerate(json_metadata["cards"]): # card: {"id": 1, "rect": [x1, y1, x2, y2], "click_point": [cx, cy]} rect = card.get("rect") if not rect: continue x1, y1, x2, y2 = rect # 边界检查 x1 = max(0, min(x1, w_img)) x2 = max(0, min(x2, w_img)) y1 = max(0, min(y1, h_img)) y2 = max(0, min(y2, h_img)) if x2 <= x1 or y2 <= y1: continue # 裁剪卡片 cropped_card = original_img[y1:y2, x1:x2] # 识别 logger.info(f"正在识别第 {idx+1}/{num_cards} 个卡片 (区域: {rect})...") t_card_start = time.time() parsed_data = ocr_kit.recognize(cropped_card) t_card_end = time.time() # Log detailed OCR result logger.info(f"卡片 {idx+1} OCR 识别耗时: {t_card_end - t_card_start:.2f}s") if parsed_data and parsed_data.get("station_name"): # 格式化数据以匹配原有结构 piles_list = parsed_data.get("piles", []) piles_str_parts = [] for p in piles_list: p_type = p.get("type", "") p_free = p.get("free", 0) p_total = p.get("total", 0) piles_str_parts.append(f"{p_type}:{p_free}/{p_total}") piles_str = " ".join(piles_str_parts) station_info = { "station_name": parsed_data.get("station_name"), "price": str(parsed_data.get("price")) if parsed_data.get("price") is not None else "", "pro_price": "", # 暂未解析 "piles": piles_str, "distance": parsed_data.get("distance", ""), "uia_center_x": card["click_point"][0], "uia_center_y": card["click_point"][1], "tags": parsed_data.get("tags", []), "parking_info": parsed_data.get("parking", "") } logger.info(f"✅ 成功解析场站: {station_info['station_name']} | 距离: {station_info['distance']}") # 立即记录场站基础状态信息 (电桩、价格等) try: await service.record_station_status(station_info) except Exception as e: logger.error(f"❌ 记录场站状态失败: {e}") stations.append(station_info) else: logger.warning(f"⚠️ 卡片 {idx+1} 未能识别出有效场站名称") except Exception as e: logger.error(f"❌ Local OCR processing failed: {e}", exc_info=True) logger.info(f"Step [5/6] 本地 OCR 解析完成 (总耗时: {time.time() - t_ocr_total:.2f}s) | 成功识别: {len(stations)}/{len(json_metadata.get('cards', []))}") if not stations: logger.warning("当前页面未检测到任何有效的场站信息") else: logger.info(f"Step [6/6] 准备开始逐一点击场站进入详情页 (共 {len(stations)} 个)...") new_stations_found = False for s_idx, s in enumerate(stations): name = s.get("station_name") # 过滤掉明显的非场站名称 if not name: continue # --- 短期去重:检查 Redis 缓存 --- # 2分钟内如果处理过该场站,则跳过 redis_key = f"processed_station:{name}" if await redis_kit.exists(redis_key): logger.info(f"场站 {name} 在 2 分钟内已处理过,跳过") continue if name in seen_names: continue # 使用从 JSON 元数据中恢复的点击坐标 x = s.get("uia_center_x") y = s.get("uia_center_y") if x is None or y is None: logger.warning(f"场站 {name} 缺少坐标信息,无法进入详情页") continue seen_names.add(name) all_stations.append(s) new_stations_found = True # 打印基本信息 price = s.get("price") pro_price = s.get("pro_price") piles_str = s.get("piles", "") distance_str = s.get("distance", "") # 检查距离 dist_log_part = "" if distance_str: try: import re dist_match = re.search(r"(\d+(\.\d+)?)", distance_str) if dist_match: dist_val = float(dist_match.group(1)) is_km = "km" in distance_str.lower() dist_km = dist_val if is_km else dist_val / 1000.0 # 将距离解析结果和限制检查合并到日志中 dist_log_part = f" (解析: {dist_km:.2f}km / 限: {MAX_CRAWL_DISTANCE}km)" if dist_km >= MAX_CRAWL_DISTANCE: logger.info(f"当前场站 {name} 距离 {dist_km}km (原始: {distance_str}) 已达到或超过限制 {MAX_CRAWL_DISTANCE}km。") max_distance_reached = True except Exception as e: logger.warning(f"解析距离出错: {distance_str}, {e}") pro_info = f" | PRO会员价: {pro_price}" if pro_price else "" # 确保距离始终显示,如果为空则显示 "未知" display_distance = distance_str if distance_str else "未知" logger.info(f"正在处理第 {s_idx+1} 个场站: {name} | 价格: {price}{pro_info} | 枪: {piles_str} | 距离: {display_distance}{dist_log_part}") if max_distance_reached: break # --- 点击进入详情页记录地址 --- # 使用相对坐标点击,以适应不同分辨率 if img_w > 0 and img_h > 0: rel_x = max(0.0, min(1.0, x / img_w)) rel_y = max(0.0, min(1.0, y / img_h)) logger.info(f"👉 正在点击进入详情页: {name} (相对坐标: {rel_x:.3f}, {rel_y:.3f})") d.click(rel_x, rel_y) else: logger.info(f"👉 正在点击进入详情页: {name} (绝对坐标: {x}, {y})") d.click(x, y) await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD) # 等待详情页加载 # 尝试获取并保存地址 address_recorded = False for retry in range(2): # 1. 拍摄详情页截图 t_d_shot = time.time() detail_uuid = str(uuid.uuid4()) detail_screenshot_path = take_screenshot(d, detail_uuid, save_dir=save_dir) logger.info(f"Step [详情页截图] 耗时: {time.time() - t_d_shot:.4f}s") # 2. 上传至 OBS t_d_up = time.time() detail_object_key = f"{OBS_TMP_PREFIX}/{detail_uuid}.jpg" up_success, _ = uploader.upload_file(detail_object_key, detail_screenshot_path) logger.info(f"Step [上传详情页截图] 耗时: {time.time() - t_d_up:.4f}s") if up_success: detail_url = f"https://{CDN_DOMAIN}/{detail_object_key}" # 3. 解析并保存地址 (异步后台处理) # 定义异步任务函数 async def _process_address_task(task_name, task_url, task_dev_info, task_redis_key): t_addr_task = time.time() try: addr_res = await service.process_station_address(task_name, task_url, device_info=task_dev_info) logger.info(f"Step [解析并保存地址-后台] 耗时: {time.time() - t_addr_task:.4f}s") if addr_res.get("address"): logger.info(f"成功记录地址: {addr_res.get('address')}") # 记录到 Redis,设置过期时间 await redis_kit.set_data(task_redis_key, "1", expire=REDIS_STATION_EXPIRE) except Exception as e: logger.error(f"后台处理地址失败 ({task_name}): {e}") # 创建并启动任务 addr_task = asyncio.create_task(_process_address_task(name, detail_url, device_info, redis_key)) background_tasks.append(addr_task) logger.info(f"已提交地址解析任务到后台: {name}") # 标记为已记录以跳出 retry 循环 address_recorded = True # 清理详情页临时截图 if not KEEP_SCREENSHOTS and os.path.exists(detail_screenshot_path): os.remove(detail_screenshot_path) break # 直接跳出 retry 循环 # --- 抓取价格时段信息 (New Feature) --- # 仅在测试模式下或特定需求下启用 try: logger.info(f"开始抓取价格时段信息: {name}") # 使用几何特征识别 "全部时段" 按钮 # 临时截图 temp_uuid = "temp_find_expand" screenshot_path = take_screenshot(d, temp_uuid, save_dir=TEMP_IMAGE_DIR) # 尝试识别,将调试图片保存到 Images 目录 t_find = time.time() pos = Kit.find_expand_button_position(screenshot_path, debug_dir=save_dir, debug_filename_prefix=name) logger.info(f"Step [识别展开按钮] 耗时: {time.time() - t_find:.4f}s") # 清理截图 try: os.remove(screenshot_path) except: pass if pos: x, y = pos logger.info(f"通过几何特征找到 '全部时段' 按钮: ({x}, {y})") d.click(x, y) await asyncio.sleep(1.5) # 抓取3屏截图 price_image_urls = [] for p_idx in range(1, 4): # 使用场站名称的MD5值作为文件名前缀,避免中文和特殊字符导致的URL问题 t_p_loop = time.time() name_md5 = hashlib.md5(name.encode('utf-8')).hexdigest() p_uuid = f"{name_md5}_price_{p_idx}" p_path = take_screenshot(d, p_uuid, save_dir=save_dir) logger.info(f"已保存价格时段截图 {p_idx} (Station: {name}, MD5: {name_md5}): {p_path}") # 上传到 OBS p_object_key = f"{OBS_TMP_PREFIX}/{p_uuid}.jpg" success, _ = uploader.upload_file(p_object_key, p_path) if success: p_url = f"https://{CDN_DOMAIN}/{p_object_key}" price_image_urls.append(p_url) logger.info(f"截图上传成功: {p_url}") else: logger.warning(f"截图上传失败: {p_path}") logger.info(f"Step [价格截图与上传-{p_idx}] 耗时: {time.time() - t_p_loop:.4f}s") if p_idx < 3: d.swipe_ext("up", scale=0.7) await asyncio.sleep(1.0) # 调用服务解析价格时段 (异步后台处理) if price_image_urls: logger.info(f"正在调用接口解析电费时段数据 ({len(price_image_urls)} 张图片) - 转入后台...") async def _process_price_task(task_name, task_urls, task_dev_info): t_price_task = time.time() try: await service.process_price_schedule(task_name, task_urls, device_info=task_dev_info) logger.info(f"Step [解析电费时段数据-后台] 耗时: {time.time() - t_price_task:.4f}s") logger.info(f"电费时段数据解析完成: {task_name}") except Exception as e: logger.error(f"后台处理价格时段失败 ({task_name}): {e}") price_task = asyncio.create_task(_process_price_task(name, price_image_urls, device_info)) background_tasks.append(price_task) else: logger.warning("未能获取到有效的价格时段截图,跳过解析") else: logger.warning(f"未能通过几何特征识别 '全部时段' 按钮") except Exception as e: logger.warning(f"抓取价格时段信息时发生异常: {e}") # 测试模式:仅处理第一个场站 # logger.info("测试模式:仅处理第一个场站,即将退出 (Y3)") # return all_stations # 返回列表页 d.press("back") await asyncio.sleep(WAIT_BACK_TO_LIST) # 等待列表页重新稳定 if not new_stations_found and i > 0: logger.info("未发现更多新场站,停止下拉。") # 如果需要强行爬取所有,可以注释掉 break # break # 5. 如果还没到最后一页,执行下拉 if i < max_scrolls and not max_distance_reached: logger.info(f"执行下拉翻页 (距离比例: {SCROLL_DISTANCE_RATIO})...") # 根据配置的比例计算滑动起始和终点 # 保证滑动在屏幕中心区域进行 start_y = 0.5 + (SCROLL_DISTANCE_RATIO / 2) end_y = 0.5 - (SCROLL_DISTANCE_RATIO / 2) d.swipe(w * 0.5, h * start_y, w * 0.5, h * end_y, duration=0.5) await asyncio.sleep(WAIT_AFTER_SCROLL) # 等待页面加载和列表稳定 # 清理列表页截图 if not KEEP_SCREENSHOTS: if os.path.exists(screenshot_path): os.remove(screenshot_path) if os.path.exists(vl_img_path): os.remove(vl_img_path) if os.path.exists(json_path): os.remove(json_path) logger.info(f"爬取任务结束,正在等待 {len(background_tasks)} 个后台处理任务完成...") if background_tasks: await asyncio.gather(*background_tasks) logger.info("所有后台任务已完成。") logger.info(f"任务结束,共采集到 {len(all_stations)} 个场站。停止原因: {stop_reason}") return all_stations def clean_images_dir(): """清理 Images 目录下除保留文件外的所有文件""" base_dir = os.path.dirname(os.path.abspath(__file__)) images_dir = os.path.join(base_dir, "Images") if not os.path.exists(images_dir): return keep_files = {'1.jpg', '2.jpg', '3.jpg', '4.jpg'} logger.info(f"正在清理 Images 目录 (保留: {keep_files})...") deleted_count = 0 for filename in os.listdir(images_dir): if filename in keep_files: continue file_path = os.path.join(images_dir, filename) try: if os.path.isfile(file_path): os.remove(file_path) deleted_count += 1 except Exception as e: logger.warning(f"删除文件失败 {filename}: {e}") logger.info(f"清理完成,共删除 {deleted_count} 个文件。") async def clean_redis_data(redis_kit): """清除 Redis 中的场站处理记录""" if TEST_CLEAR_REDIS: logger.info("测试模式开启:正在清除 Redis 中的场站处理记录 (processed_station:*)...") keys = await redis_kit.keys("processed_station:*") if keys: count = await redis_kit.delete(*keys) logger.info(f"已清除 {count} 条场站处理记录") else: logger.info("未发现旧的场站处理记录") async def main(service=None, do_cleanup=True): # 连接设备 d = u2.connect() # 初始化服务 should_close_service = False if service is None: service = XinDianTuService() should_close_service = True # 初始化数据库连接(XinDianTuService 需要) await service.init_db() uploader = ObsUploader() redis_kit = RedisKit() try: if do_cleanup: # 清理图片目录 clean_images_dir() # [Testing] 如果配置为测试模式,启动时清除 Redis 记录 await clean_redis_data(redis_kit) # 清理过期数据 # await service.cleanup_old_data() # 获取场站列表 stations = await get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS) if stations: logger.info("场站列表采集完成。") else: logger.warning("未采集到任何场站信息。") return True except Exception as e: logger.exception(f"运行过程中出现异常: {e}") return False finally: # 关闭数据库连接 if should_close_service: await service.close_db() if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: logger.info("程序被用户中断.") except Exception as e: logger.exception(f"程序崩溃: {e}")