Files
aiData/Apps/XinDianTu/Crawler.py
HuangHai ca23ebf606 'commit'
2026-01-12 08:09:32 +08:00

582 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import asyncio
import logging
import uuid
import os
import sys
import json
import time
import hashlib
from PIL import Image
# 将项目根目录添加到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
sys.path.append(project_root)
import uiautomator2 as u2
from Core.BaseCrawler import BaseCrawler
from Util import Kit
from Util.Kit import take_screenshot
from Util.ObsUtil import ObsUploader
from Util.RedisKit import RedisKit
from Util.PaddleOCRKit import get_ocr_kit
import cv2
from Apps.XinDianTu.Service import XinDianTuService
from Config.Config import (
OBS_TMP_PREFIX, CDN_DOMAIN, SCROLL_DISTANCE_RATIO,
MAX_SCROLLS, REDIS_STATION_EXPIRE,
WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, WAIT_AFTER_SCROLL,
MAX_CRAWL_DISTANCE, TEMP_IMAGE_DIR
)
# --- User configuration ---
# Whether to keep screenshot files (True = keep them for later inspection,
# False = delete them as soon as they have been used).
KEEP_SCREENSHOTS = True
# [Testing] Whether to clear the station-processing records in Redis at startup.
# True: clear the records before every run, convenient for re-testing the same station.
# False: production mode, keep the records so stations are not crawled twice.
TEST_CLEAR_REDIS = True
# Configuration notes:
# SCROLL_DISTANCE_RATIO controls the swipe distance used when paging (change it in Config.py).
# On phones with a lower resolution or a smaller screen (e.g. Huawei Mate 20x), if some
# stations are skipped while paging, try lowering SCROLL_DISTANCE_RATIO (e.g. to 0.4 or 0.3).
# A shorter swipe makes sure every station is fully displayed and recognised.
# Configure log output: INFO level, written to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("StationList")
# Force the related loggers to INFO so third-party libraries cannot change their level.
logging.getLogger("PaddleOCRKit").setLevel(logging.INFO)
logging.getLogger("Util.Kit").setLevel(logging.INFO)
logging.getLogger("OpenXinDianTu").setLevel(logging.INFO)
logging.getLogger("FullProcess").setLevel(logging.INFO)
class XinDianTuCrawler(BaseCrawler):
    """Crawler implementation for the XinDianTu (新电途) mini program."""

    def __init__(self, service=None):
        """Forward *service* to BaseCrawler and create the helper objects."""
        super().__init__(service)
        # Paging limit comes from the shared configuration.
        self.max_scrolls = MAX_SCROLLS
        self.uploader = ObsUploader()
        self.redis_kit = RedisKit()

    async def start(self):
        """Start the crawl by delegating to the module-level ``main``.

        Kept for compatibility with the older flow.
        NOTE(review): ``self.service`` is presumably set by
        ``BaseCrawler.__init__`` - confirm against the base class.
        """
        await main(self.service)

    async def open_app(self):
        """Placeholder wrapper; the real logic lives in Opener.py."""
        return None

    async def crawl_list(self):
        """Placeholder wrapper; the real logic lives in ``get_station_list``."""
        return None

    async def crawl_detail(self, station_info):
        """Placeholder; detail crawling is not implemented here."""
        return None
def _parse_distance_km(distance_str):
    """Convert a distance label such as "850m" or "1.2km" into kilometres.

    The first number found in the text is used. The value is treated as
    kilometres when the text contains "km" (case-insensitive), otherwise as
    metres. Returns None when the text contains no number.
    """
    import re  # function-scope import: the module does not import re at top level

    match = re.search(r"(\d+(\.\d+)?)", distance_str)
    if not match:
        return None
    value = float(match.group(1))
    return value if "km" in distance_str.lower() else value / 1000.0


async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS):
    """
    Collect the station list page by page and visit each new station's detail page.

    For every page the function takes a screenshot, lets
    ``Kit.crop_cards_from_image`` locate the station cards, runs local OCR on
    each card and records the parsed status through
    ``service.record_station_status``. Each station that has not been seen in
    this run and has no ``processed_station:<name>`` key in Redis is then
    opened: a detail screenshot is uploaded and handed to
    ``service.process_station_address`` in a background task, and up to three
    price-schedule screenshots are uploaded and handed to
    ``service.process_price_schedule`` in a background task.

    Args:
        d: Connected uiautomator2 device.
        service: Object providing ``record_station_status``,
            ``process_station_address`` and ``process_price_schedule``.
        uploader: Object whose ``upload_file(object_key, path)`` returns a
            ``(success, _)`` pair.
        max_scrolls: Maximum number of swipes; at most ``max_scrolls + 1``
            pages are processed.

    Returns:
        list[dict]: The ``station_info`` dicts of all stations newly seen in
        this run, in processing order.
    """
    all_stations = []
    seen_names = set()
    redis_kit = RedisKit()
    # Background tasks (address / price parsing) awaited before returning.
    background_tasks = []

    # Both coroutines only depend on `service` / `redis_kit`, so they are
    # defined once here instead of once per station.
    async def _process_address_task(task_name, task_url, task_dev_info, task_redis_key):
        t_addr_task = time.time()
        try:
            addr_res = await service.process_station_address(task_name, task_url, device_info=task_dev_info)
            logger.info(f"Step [解析并保存地址-后台] 耗时: {time.time() - t_addr_task:.4f}s")
            if addr_res.get("address"):
                logger.info(f"成功记录地址: {addr_res.get('address')}")
                # Mark the station as processed in Redis, with an expiry.
                await redis_kit.set_data(task_redis_key, "1", expire=REDIS_STATION_EXPIRE)
        except Exception as e:
            logger.error(f"后台处理地址失败 ({task_name}): {e}")

    async def _process_price_task(task_name, task_urls, task_dev_info):
        t_price_task = time.time()
        try:
            await service.process_price_schedule(task_name, task_urls, device_info=task_dev_info)
            logger.info(f"Step [解析电费时段数据-后台] 耗时: {time.time() - t_price_task:.4f}s")
            logger.info(f"电费时段数据解析完成: {task_name}")
        except Exception as e:
            logger.error(f"后台处理价格时段失败 ({task_name}): {e}")

    # The device is expected to already show the station list page.
    window_size = d.window_size()
    w, h = window_size[0], window_size[1]
    # Device info is passed to the service for coordinate calculations.
    device_info = d.info
    device_info['width'] = w
    device_info['height'] = h
    logger.info(f"设备信息: {device_info.get('productName')} | 窗口分辨率: {w}x{h}")

    max_distance_reached = False
    stop_reason = "max_scrolls_reached"  # default: ran out of allowed page turns
    save_dir = TEMP_IMAGE_DIR

    for i in range(max_scrolls + 1):
        if max_distance_reached:
            logger.info("已达到最大抓取距离,停止翻页。")
            stop_reason = "max_distance_reached"
            break
        logger.info(f"正在处理第 {i + 1} 页...")

        # 1. Take the list-page screenshot.
        logger.info(f"Step [1/6] 正在拍摄列表页截图...")
        t_shot = time.time()
        image_uuid = str(uuid.uuid4())
        screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir)
        logger.info(f"Step [1/6] 列表页截图已完成: {screenshot_path} (耗时: {time.time() - t_shot:.2f}s)")

        # Check the real screenshot size; it may differ from the window size.
        img_w, img_h = 0, 0
        try:
            with Image.open(screenshot_path) as img:
                img_w, img_h = img.size
            logger.info(f"截图实际尺寸: {img_w}x{img_h}")
            # On mismatch, the screenshot size wins for coordinate conversion.
            if img_w != w or img_h != h:
                logger.warning(f"检测到截图尺寸 ({img_w}x{img_h}) 与窗口尺寸 ({w}x{h}) 不一致,将以截图尺寸为准进行坐标换算。")
                device_info['width'] = img_w
                device_info['height'] = img_h
        except Exception as e:
            logger.warning(f"无法获取截图尺寸信息: {e}")

        # 2. Slice the screenshot into cards; this writes <name>_vl.jpg and <name>.json.
        logger.info("Step [2/6] 正在执行本地图形学切片分析,识别场站卡片并生成绿色方框图...")
        t_crop = time.time()
        Kit.crop_cards_from_image(screenshot_path)
        logger.info(f"Step [2/6] 图形学切片分析完成 (耗时: {time.time() - t_crop:.2f}s)")

        # Load the generated metadata.
        json_path = screenshot_path.replace(".jpg", ".json")
        vl_img_path = screenshot_path.replace(".jpg", "_vl.jpg")
        if not os.path.exists(json_path) or not os.path.exists(vl_img_path):
            logger.error(f"❌ Step [2/6] 失败: 未生成有效的 JSON 或 VL 图片 (JSON: {os.path.exists(json_path)}, VL: {os.path.exists(vl_img_path)})")
            logger.warning("可能未识别到任何卡片,跳过当前页")
            # NOTE(review): this skips the swipe below as well, so the next
            # iteration captures the same screen again - confirm this is intended.
            continue
        logger.info(f"Step [3/6] 绿色方框图已生成: {vl_img_path}")
        with open(json_path, 'r', encoding='utf-8') as f:
            json_metadata = json.load(f)

        # 3. Local OCR of every detected card via PaddleOCRKit.
        logger.info("Step [4/6] 正在开始本地 OCR 识别流程,将识别出的卡片逐一交给 OCR...")
        t_ocr_total = time.time()
        stations = []
        try:
            ocr_kit = get_ocr_kit()
            # Crop from the original screenshot: cleaner than _vl.jpg (no green boxes).
            original_img = cv2.imread(screenshot_path)
            if "cards" in json_metadata and original_img is not None:
                h_img, w_img = original_img.shape[:2]
                num_cards = len(json_metadata['cards'])
                logger.info(f"检测到 {num_cards} 个场站卡片,准备开始逐一 OCR 识别...")
                for idx, card in enumerate(json_metadata["cards"]):
                    # card: {"id": 1, "rect": [x1, y1, x2, y2], "click_point": [cx, cy]}
                    rect = card.get("rect")
                    if not rect:
                        continue
                    x1, y1, x2, y2 = rect
                    # Clamp the rectangle to the image bounds.
                    x1 = max(0, min(x1, w_img))
                    x2 = max(0, min(x2, w_img))
                    y1 = max(0, min(y1, h_img))
                    y2 = max(0, min(y2, h_img))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    cropped_card = original_img[y1:y2, x1:x2]

                    logger.info(f"正在识别第 {idx+1}/{num_cards} 个卡片 (区域: {rect})...")
                    t_card_start = time.time()
                    parsed_data = ocr_kit.recognize(cropped_card)
                    t_card_end = time.time()
                    logger.info(f"卡片 {idx+1} OCR 识别耗时: {t_card_end - t_card_start:.2f}s")

                    if parsed_data and parsed_data.get("station_name"):
                        # Flatten the pile list into "type:free/total" items.
                        piles_str = " ".join(
                            f"{p.get('type', '')}:{p.get('free', 0)}/{p.get('total', 0)}"
                            for p in parsed_data.get("piles", [])
                        )
                        station_info = {
                            "station_name": parsed_data.get("station_name"),
                            "price": str(parsed_data.get("price")) if parsed_data.get("price") is not None else "",
                            "pro_price": "",  # not parsed yet
                            "piles": piles_str,
                            "distance": parsed_data.get("distance", ""),
                            "uia_center_x": card["click_point"][0],
                            "uia_center_y": card["click_point"][1],
                            "tags": parsed_data.get("tags", []),
                            "parking_info": parsed_data.get("parking", "")
                        }
                        logger.info(f"✅ 成功解析场站: {station_info['station_name']} | 距离: {station_info['distance']}")
                        # Record the basic status (piles, price, ...) right away.
                        try:
                            await service.record_station_status(station_info)
                        except Exception as e:
                            logger.error(f"❌ 记录场站状态失败: {e}")
                        stations.append(station_info)
                    else:
                        logger.warning(f"⚠️ 卡片 {idx+1} 未能识别出有效场站名称")
        except Exception as e:
            logger.error(f"❌ Local OCR processing failed: {e}", exc_info=True)
        logger.info(f"Step [5/6] 本地 OCR 解析完成 (总耗时: {time.time() - t_ocr_total:.2f}s) | 成功识别: {len(stations)}/{len(json_metadata.get('cards', []))}")

        if not stations:
            logger.warning("当前页面未检测到任何有效的场站信息")
        else:
            logger.info(f"Step [6/6] 准备开始逐一点击场站进入详情页 (共 {len(stations)} 个)...")

        # 4. Visit every new station on this page.
        new_stations_found = False
        for s_idx, s in enumerate(stations):
            name = s.get("station_name")
            if not name:
                continue
            # Short-term de-duplication: skip stations with a Redis marker.
            redis_key = f"processed_station:{name}"
            if await redis_kit.exists(redis_key):
                logger.info(f"场站 {name} 在 2 分钟内已处理过,跳过")
                continue
            if name in seen_names:
                continue
            # Click coordinates recovered from the JSON metadata.
            x = s.get("uia_center_x")
            y = s.get("uia_center_y")
            if x is None or y is None:
                logger.warning(f"场站 {name} 缺少坐标信息,无法进入详情页")
                continue
            seen_names.add(name)
            all_stations.append(s)
            new_stations_found = True

            price = s.get("price")
            pro_price = s.get("pro_price")
            piles_str = s.get("piles", "")
            distance_str = s.get("distance", "")

            # Distance check against MAX_CRAWL_DISTANCE.
            dist_log_part = ""
            if distance_str:
                try:
                    dist_km = _parse_distance_km(distance_str)
                    if dist_km is not None:
                        dist_log_part = f" (解析: {dist_km:.2f}km / 限: {MAX_CRAWL_DISTANCE}km)"
                        if dist_km >= MAX_CRAWL_DISTANCE:
                            logger.info(f"当前场站 {name} 距离 {dist_km}km (原始: {distance_str}) 已达到或超过限制 {MAX_CRAWL_DISTANCE}km。")
                            max_distance_reached = True
                except Exception as e:
                    logger.warning(f"解析距离出错: {distance_str}, {e}")

            pro_info = f" | PRO会员价: {pro_price}" if pro_price else ""
            # Always show a distance; fall back to "未知" when it is empty.
            display_distance = distance_str if distance_str else "未知"
            logger.info(f"正在处理第 {s_idx+1} 个场站: {name} | 价格: {price}{pro_info} | 枪: {piles_str} | 距离: {display_distance}{dist_log_part}")
            if max_distance_reached:
                break

            # Open the detail page. Relative coordinates adapt to the resolution.
            if img_w > 0 and img_h > 0:
                rel_x = max(0.0, min(1.0, x / img_w))
                rel_y = max(0.0, min(1.0, y / img_h))
                logger.info(f"👉 正在点击进入详情页: {name} (相对坐标: {rel_x:.3f}, {rel_y:.3f})")
                d.click(rel_x, rel_y)
            else:
                logger.info(f"👉 正在点击进入详情页: {name} (绝对坐标: {x}, {y})")
                d.click(x, y)
            await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)  # wait for the detail page to load

            # Capture + upload the detail page; a failed upload is retried once.
            # NOTE(review): the nesting of the original cleanup/break was lost with
            # the file's indentation; here the loop stops after a successful upload
            # and the local screenshot is removed on both paths - confirm.
            for _ in range(2):
                t_d_shot = time.time()
                detail_uuid = str(uuid.uuid4())
                detail_screenshot_path = take_screenshot(d, detail_uuid, save_dir=save_dir)
                logger.info(f"Step [详情页截图] 耗时: {time.time() - t_d_shot:.4f}s")

                t_d_up = time.time()
                detail_object_key = f"{OBS_TMP_PREFIX}/{detail_uuid}.jpg"
                up_success, _unused = uploader.upload_file(detail_object_key, detail_screenshot_path)
                logger.info(f"Step [上传详情页截图] 耗时: {time.time() - t_d_up:.4f}s")

                if up_success:
                    detail_url = f"https://{CDN_DOMAIN}/{detail_object_key}"
                    # Address parsing runs in the background.
                    addr_task = asyncio.create_task(_process_address_task(name, detail_url, device_info, redis_key))
                    background_tasks.append(addr_task)
                    logger.info(f"已提交地址解析任务到后台: {name}")

                # The background task works from the URL, so the local file can go.
                if not KEEP_SCREENSHOTS and os.path.exists(detail_screenshot_path):
                    os.remove(detail_screenshot_path)
                if up_success:
                    break

            # Capture the price-schedule screens.
            try:
                logger.info(f"开始抓取价格时段信息: {name}")
                # Locate the "全部时段" button by its geometric features, using a
                # temporary screenshot. A dedicated variable is used so the
                # list-page `screenshot_path` stays intact for the cleanup at the
                # end of this page.
                temp_uuid = "temp_find_expand"
                expand_shot_path = take_screenshot(d, temp_uuid, save_dir=TEMP_IMAGE_DIR)
                t_find = time.time()
                pos = Kit.find_expand_button_position(expand_shot_path, debug_dir=save_dir, debug_filename_prefix=name)
                logger.info(f"Step [识别展开按钮] 耗时: {time.time() - t_find:.4f}s")
                # Best-effort removal of the temporary screenshot.
                try:
                    os.remove(expand_shot_path)
                except OSError as e:
                    logger.debug(f"清理临时截图失败: {e}")

                if pos:
                    btn_x, btn_y = pos
                    logger.info(f"通过几何特征找到 '全部时段' 按钮: ({btn_x}, {btn_y})")
                    d.click(btn_x, btn_y)
                    await asyncio.sleep(1.5)

                    # The MD5 of the station name is used as file-name prefix so
                    # Chinese / special characters never end up in the URL.
                    name_md5 = hashlib.md5(name.encode('utf-8')).hexdigest()
                    price_image_urls = []
                    for p_idx in range(1, 4):  # three screens
                        t_p_loop = time.time()
                        p_uuid = f"{name_md5}_price_{p_idx}"
                        p_path = take_screenshot(d, p_uuid, save_dir=save_dir)
                        logger.info(f"已保存价格时段截图 {p_idx} (Station: {name}, MD5: {name_md5}): {p_path}")

                        p_object_key = f"{OBS_TMP_PREFIX}/{p_uuid}.jpg"
                        success, _unused = uploader.upload_file(p_object_key, p_path)
                        if success:
                            p_url = f"https://{CDN_DOMAIN}/{p_object_key}"
                            price_image_urls.append(p_url)
                            logger.info(f"截图上传成功: {p_url}")
                        else:
                            logger.warning(f"截图上传失败: {p_path}")
                        logger.info(f"Step [价格截图与上传-{p_idx}] 耗时: {time.time() - t_p_loop:.4f}s")
                        if p_idx < 3:
                            d.swipe_ext("up", scale=0.7)
                            await asyncio.sleep(1.0)

                    # Price-schedule parsing runs in the background.
                    if price_image_urls:
                        logger.info(f"正在调用接口解析电费时段数据 ({len(price_image_urls)} 张图片) - 转入后台...")
                        price_task = asyncio.create_task(_process_price_task(name, price_image_urls, device_info))
                        background_tasks.append(price_task)
                    else:
                        logger.warning("未能获取到有效的价格时段截图,跳过解析")
                else:
                    logger.warning(f"未能通过几何特征识别 '全部时段' 按钮")
            except Exception as e:
                logger.warning(f"抓取价格时段信息时发生异常: {e}")

            # Go back to the list page and let it settle.
            d.press("back")
            await asyncio.sleep(WAIT_BACK_TO_LIST)

        if not new_stations_found and i > 0:
            # Informational only: paging deliberately continues.
            logger.info("未发现更多新场站,停止下拉。")

        # 5. Swipe to the next page unless this was the last one.
        if i < max_scrolls and not max_distance_reached:
            logger.info(f"执行下拉翻页 (距离比例: {SCROLL_DISTANCE_RATIO})...")
            # Keep the swipe centred vertically on the screen.
            start_y = 0.5 + (SCROLL_DISTANCE_RATIO / 2)
            end_y = 0.5 - (SCROLL_DISTANCE_RATIO / 2)
            d.swipe(w * 0.5, h * start_y, w * 0.5, h * end_y, duration=0.5)
            await asyncio.sleep(WAIT_AFTER_SCROLL)  # wait for the list to load and settle

        # Remove this page's list screenshot and its derived files.
        if not KEEP_SCREENSHOTS:
            for stale_path in (screenshot_path, vl_img_path, json_path):
                if os.path.exists(stale_path):
                    os.remove(stale_path)

    logger.info(f"爬取任务结束,正在等待 {len(background_tasks)} 个后台处理任务完成...")
    if background_tasks:
        await asyncio.gather(*background_tasks)
        logger.info("所有后台任务已完成。")
    logger.info(f"任务结束,共采集到 {len(all_stations)} 个场站。停止原因: {stop_reason}")
    return all_stations
def clean_images_dir():
    """Delete every regular file in the sibling ``Images`` directory except the kept ones.

    The directory is resolved relative to this source file. Sub-directories
    are left alone, and a file that cannot be removed is logged and skipped.
    """
    images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Images")
    if not os.path.exists(images_dir):
        return

    keep_files = {'1.jpg', '2.jpg', '3.jpg', '4.jpg'}
    logger.info(f"正在清理 Images 目录 (保留: {keep_files})...")

    removed = 0
    candidates = [entry for entry in os.listdir(images_dir) if entry not in keep_files]
    for entry_name in candidates:
        target = os.path.join(images_dir, entry_name)
        try:
            if not os.path.isfile(target):
                continue
            os.remove(target)
        except Exception as e:
            logger.warning(f"删除文件失败 {entry_name}: {e}")
        else:
            # Only reached when the file was actually removed.
            removed += 1
    logger.info(f"清理完成,共删除 {removed} 个文件。")
async def clean_redis_data(redis_kit):
    """Delete all ``processed_station:*`` keys from Redis when TEST_CLEAR_REDIS is on.

    Does nothing when the flag is off.
    """
    if not TEST_CLEAR_REDIS:
        return

    logger.info("测试模式开启:正在清除 Redis 中的场站处理记录 (processed_station:*)...")
    stale_keys = await redis_kit.keys("processed_station:*")
    if not stale_keys:
        logger.info("未发现旧的场站处理记录")
        return

    removed = await redis_kit.delete(*stale_keys)
    logger.info(f"已清除 {removed} 条场站处理记录")
async def main(service=None, do_cleanup=True):
    """Connect to the device and run one full station-list crawl.

    Args:
        service: Service object to use. When None, a ``XinDianTuService`` is
            created, its database connection is initialised here and closed
            again before returning.
        do_cleanup: When true, the Images directory and (in test mode) the
            Redis station records are cleaned before crawling.

    Returns:
        bool: True when the crawl ran to completion (even with no stations),
        False when an exception was raised and logged.
    """
    # Connect to the device.
    d = u2.connect()

    # Only a service created here is initialised and closed by this function.
    owns_service = service is None
    if owns_service:
        service = XinDianTuService()
        await service.init_db()

    uploader = ObsUploader()
    redis_kit = RedisKit()
    try:
        if do_cleanup:
            clean_images_dir()
            # [Testing] clears the Redis records only when test mode is configured.
            await clean_redis_data(redis_kit)

        stations = await get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS)
        if not stations:
            logger.warning("未采集到任何场站信息。")
        else:
            logger.info("场站列表采集完成。")
        return True
    except Exception as e:
        logger.exception(f"运行过程中出现异常: {e}")
        return False
    finally:
        # Close the database connection opened above.
        if owns_service:
            await service.close_db()
# Script entry point: run the crawl once with the default service and cleanup.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop the crawl; log it without a traceback.
        logger.info("程序被用户中断.")
    except Exception as e:
        # Anything escaping main() is logged with its traceback.
        logger.exception(f"程序崩溃: {e}")