Files
aiData/Apps/TeLaiDian/Crawler.py
HuangHai 78f116ab84 'commit'
2026-01-16 19:30:31 +08:00

566 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import asyncio
import os
import sys
import time
import json

# Project-root bootstrap.
# This must run BEFORE the project-local imports below (Apps.*, Core.*, Util.*):
# when this file is executed directly, sys.path[0] is this file's own directory,
# so those packages are only importable once the directory three levels up
# (the project root) has been added to sys.path.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)

import cv2  # noqa: E402  (imports intentionally follow the sys.path bootstrap)
import uiautomator2 as u2  # noqa: E402

from Apps.TeLaiDian.Kit import (  # noqa: E402
    take_screenshot, get_image_content_md5, clean_station_name,
    setup_logger, read_image, save_image, detect_warm_popup_xczs_cv
)
from Apps.TeLaiDian.ReadImageKit import ReadImageKit  # noqa: E402
from Apps.TeLaiDian.Service import TeLaiDianService  # noqa: E402
from Apps.TeLaiDian.Config.Setting import (  # noqa: E402
    SCROLL_DISTANCE_RATIO, WAIT_AFTER_SCROLL, MAX_STATIONS_COUNT,
    SAFE_EXCLUDE_RATIO, BOTTOM_SAFE_EXCLUDE_RATIO, WAIT_DETAIL_PAGE_LOAD,
    WAIT_BACK_TO_LIST, DETAIL_SCROLL_DISTANCE_RATIO, FIRST_RUN_ONLY_ONE_STATION,
    REDIS_STATION_EXPIRE
)
from Core.BaseCrawler import BaseCrawler  # noqa: E402
from Util.RedisKit import RedisKit  # noqa: E402

# Module-level logger shared by everything in this file.
logger = setup_logger("TeLaiDianCrawler")
class TeLaiDianCrawler(BaseCrawler):
    """Screenshot-driven crawler for the TeLaiDian mini program.

    The crawler drives a device through a uiautomator2 handle, takes
    screenshots, and hands them to ``ReadImageKit`` (OCR / VLM helpers) to
    decide where to tap and what data to extract.  Stations that were already
    crawled are skipped via Redis keys, and price screenshots are analysed in
    background asyncio tasks that are awaited before the crawl returns.
    """

    def __init__(self, service=None):
        """Create the crawler.

        Args:
            service: Persistence service; a ``TeLaiDianService`` is created
                when omitted.  It is passed to ``BaseCrawler.__init__`` and is
                later used through ``self.service`` (presumably set by the
                base class -- confirm in ``Core.BaseCrawler``).
        """
        super().__init__(service or TeLaiDianService())
        self.read_image_kit = ReadImageKit()
        self.redis_kit = RedisKit()
        # Background tasks created by crawl_detail_logic; drained at the end
        # of crawl_list_logic.
        self.pending_price_tasks = []
        # Maximum number of price screenshots analysed concurrently per station.
        self.vlm_concurrency = 3

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _remove_file(path):
        """Best-effort deletion of a temporary screenshot; never raises."""
        if not path:
            return
        try:
            if os.path.exists(path):
                os.remove(path)
        except OSError as e:
            logger.warning(f"[清理] 删除临时文件失败: {path} ({e})")

    async def _wait_pending_price_tasks(self):
        """Await every background price task, then clear the pending list."""
        if not self.pending_price_tasks:
            return
        logger.info(f"[收尾] 等待后台价格任务完成,共 {len(self.pending_price_tasks)} 个...")
        try:
            # return_exceptions=True: one failed task must not cancel the wait.
            await asyncio.gather(*self.pending_price_tasks, return_exceptions=True)
        finally:
            self.pending_price_tasks.clear()
        logger.info("[收尾] 后台价格任务已全部完成")

    async def _save_basic_profile(self, *, station_name, address, distance,
                                  total_piles, free_piles, piles_detail, parking_info):
        """Persist the station profile/status without prices; errors are logged."""
        try:
            await self.service.save_station_profile_and_status(
                station_name=station_name,
                address=address,
                total_piles=total_piles,
                free_piles=free_piles,
                piles_detail=piles_detail,
                parking_info=parking_info,
                distance=distance
            )
            logger.info(f"[详情页] 已基于整合信息写入基础数据: {station_name}")
        except Exception as e:
            logger.error(f"[详情页] 写入基础数据失败: {e}")

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------
    async def start(self):
        """Start crawling (BaseCrawler entry point).

        Note: in full-pipeline scripts such as T4 the mini program is opened
        beforehand by Opener, so this goes straight to the crawl logic.
        """
        d = u2.connect()
        await self.crawl_list_logic(d)

    async def open_app(self):
        """Open the TeLaiDian mini program via ``Opener.open_mini_program``."""
        # Imported lazily, exactly as in the original code.
        from Apps.TeLaiDian import Opener
        return await Opener.open_mini_program()

    async def clear_ads(self, d, max_rounds=3):
        """Detect and close ad popups, for at most ``max_rounds`` rounds.

        Each round takes a screenshot and asks the VLM for a close button.
        The returned ``close_point`` is scaled from a 0-1000 grid to screen
        pixels.  The loop stops as soon as a round reports no ad.

        Args:
            d: uiautomator2 device handle.
            max_rounds: Maximum number of detection rounds.
        """
        logger.info(f"--- [广告清理] 开始检测,最多尝试 {max_rounds} 轮 ---")
        for round_idx in range(1, max_rounds + 1):
            logger.info(f"[广告清理] 第 {round_idx} 轮:正在截屏分析...")
            ad_screen = take_screenshot(d, f"tld_ad_check_r{round_idx}_{int(time.time())}.jpg")
            try:
                res = await self.read_image_kit.find_close_button_vlm(ad_screen)
                if res.get("has_ad") and res.get("close_point"):
                    close_point = res.get("close_point")
                    reason = res.get("reason", "未提供原因")
                    w, h = d.window_size()
                    target_x = int(close_point[0] * w / 1000)
                    target_y = int(close_point[1] * h / 1000)
                    logger.info(f"[广告清理] 第 {round_idx}VLM 发现广告!原因: {reason}")
                    logger.info(f"[广告清理] 计划点击坐标: ({target_x}, {target_y}),归一化坐标: {close_point}")
                    # Safety check: never tap the WeChat mini-program capsule
                    # button area in the top-right corner.
                    if target_x > w * 0.75 and target_y < 150:
                        logger.warning(f"[广告清理] ⚠️ 拒绝点击疑似微信胶囊按钮的区域: ({target_x}, {target_y}),跳过本轮。")
                        continue
                    logger.info(f"[广告清理] 正在执行点击关闭操作...")
                    d.click(target_x, target_y)
                    # Give the popup time to disappear (or the next ad to appear).
                    await asyncio.sleep(2.0)
                else:
                    logger.info(f"[广告清理] 第 {round_idx}未发现广告弹窗。VLM 理由: {res.get('reason', '')}")
                    break
            except Exception as e:
                logger.error(f"[广告清理] 第 {round_idx} 轮检测发生异常: {e}")
            finally:
                # Runs on every exit path (continue / break / exception).
                self._remove_file(ad_screen)
        logger.info("--- [广告清理] 任务结束 ---")

    async def _dismiss_warm_popup(self, d):
        """Tap the "下次再说" button if both OCR and the VLM confirm the popup.

        The OCR result is treated as a box ``(x1, y1, x2, y2)`` on a 0-1000
        grid; its centre is scaled to screen pixels.  An annotated diagnostic
        image is saved before the tap; the raw screenshot is deleted afterwards.
        """
        popup_screen_path = take_screenshot(d, f"tld_detail_popup_{int(time.time())}.jpg")
        try:
            logger.info(f"[详情页] 截图用于检测温馨提示弹窗: {popup_screen_path}")
            ocr_res = detect_warm_popup_xczs_cv(popup_screen_path)
            ocr_point = None
            if ocr_res:
                w, h = d.window_size()
                ocr_point = (
                    int((ocr_res[0] + ocr_res[2]) / 2 * w / 1000),
                    int((ocr_res[1] + ocr_res[3]) / 2 * h / 1000)
                )
                logger.info(f"[详情页] OCR 检测到“下次再说”按钮位置: {ocr_point}")
            else:
                logger.info("[详情页] OCR 未能检测到“下次再说”按钮")

            vlm_popup = {"has_popup": False}
            try:
                vlm_popup = await self.read_image_kit.check_warm_popup_vlm(popup_screen_path)
            except Exception as e:
                logger.error(f"[详情页] VLM 检测温馨提示弹窗失败: {e}")
            has_vlm_popup = isinstance(vlm_popup, dict) and vlm_popup.get("has_popup")

            if not (ocr_point and has_vlm_popup):
                logger.info(f"[详情页] 温馨提示弹窗未通过双重确认OCR检测: {bool(ocr_point)} | VLM 检测: {vlm_popup}")
                return

            click_x, click_y = ocr_point
            logger.info(f"[详情页] OCR 与 VLM 均确认存在温馨提示弹窗,即将点击“下次再说”按钮: ({click_x}, {click_y})")
            debug_popup_path = popup_screen_path.replace(".jpg", f"_xczs_click_{click_x}_{click_y}.jpg")
            try:
                img_popup = read_image(popup_screen_path)
                if img_popup is not None:
                    cv2.circle(img_popup, (click_x, click_y), 20, (0, 0, 255), -1)
                    save_image(debug_popup_path, img_popup)
                    logger.info(f"[详情页] 已生成温馨提示弹窗点击诊断图片: {debug_popup_path}")
            except Exception as e:
                logger.error(f"[详情页] 生成温馨提示弹窗诊断图片失败: {e}")
            try:
                d.click(click_x, click_y)
                await asyncio.sleep(1.5)
            except Exception as e:
                logger.error(f"[详情页] 点击温馨提示“下次再说”失败: {e}")
        finally:
            # The raw screenshot was previously never deleted; only the
            # annotated diagnostic copy is kept.
            self._remove_file(popup_screen_path)

    async def crawl_list_logic(self, d):
        """Walk the station list, crawl each unseen station, then drain tasks.

        Steps: dismiss the warm-tip popup, pull down to refresh the list, then
        loop: screenshot -> stop if the content hash did not change -> detect
        stations -> open and crawl each station not yet marked in Redis ->
        scroll to the next page.  Stops after ``MAX_STATIONS_COUNT`` stations
        (or one, when ``FIRST_RUN_ONLY_ONE_STATION`` is set).

        Args:
            d: uiautomator2 device handle.
        """
        # Start-up ad clearing is disabled on request:
        # await self.clear_ads(d, max_rounds=3)
        await self._dismiss_warm_popup(d)

        # Explicit pull-down from 30% to 80% of the screen height to
        # refresh the list / recalibrate the location.
        w, h = d.window_size()
        logger.info(f"执行显式下拉刷新操作: (x={w//2}, y1={int(h*0.3)} -> y2={int(h*0.8)})")
        d.swipe(w // 2, int(h * 0.3), w // 2, int(h * 0.8), duration=0.5)
        logger.info(f"等待 {WAIT_AFTER_SCROLL} 秒确保位置校准和列表刷新完成...")
        await asyncio.sleep(WAIT_AFTER_SCROLL)

        max_to_crawl = 1 if FIRST_RUN_ONLY_ONE_STATION else MAX_STATIONS_COUNT
        processed_count = 0
        last_md5 = None

        while processed_count < max_to_crawl:
            screenshot_path = take_screenshot(d, f"tld_list_{int(time.time())}.jpg")
            try:
                # An unchanged content hash means the previous scroll moved
                # nothing, i.e. the bottom of the list was reached.  It also
                # ends the loop if re-opening the app keeps showing the same
                # screen, so last_md5 is deliberately NOT reset below.
                curr_md5 = get_image_content_md5(
                    screenshot_path,
                    top_ratio=SAFE_EXCLUDE_RATIO,
                    bottom_ratio=BOTTOM_SAFE_EXCLUDE_RATIO,
                )
                if last_md5 == curr_md5:
                    logger.info("内容无变化,判定已到底部")
                    break
                last_md5 = curr_md5

                stations = await self.read_image_kit.analyze_station_list(screenshot_path)
                if not stations:
                    # Check whether the mini program was left unexpectedly.
                    is_wrong_page = await self.check_wrong_page(d, screenshot_path)
                    if is_wrong_page:
                        logger.error("检测到已退出详情列表页(可能回到了搜索页),尝试重新进入...")
                        await self.open_app()
                        await asyncio.sleep(5)
                        continue
                    logger.info("本页未检测到场站,尝试滑动...")
                    d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
                    await asyncio.sleep(WAIT_AFTER_SCROLL)
                    continue

                for station in stations:
                    if processed_count >= max_to_crawl:
                        break
                    if not isinstance(station, dict):
                        continue
                    name = station.get("name")
                    point = station.get("point")
                    if not name or not point:
                        continue

                    # Cross-run de-duplication through Redis.
                    cleaned_name = clean_station_name(name)
                    redis_key = f"crawled:tld:{cleaned_name}"
                    if await self.redis_kit.get_data(redis_key):
                        logger.info(f"跳过已处理场站 (Redis): {name}")
                        continue

                    current_idx = processed_count + 1
                    remaining = max_to_crawl - current_idx
                    logger.info(f"--- [进度: {current_idx}/{max_to_crawl}, 剩余: {remaining}] 处理场站: {name} (坐标: {point}, 距离: {station.get('distance')}) ---")

                    # Tap the station to open its detail page.
                    d.click(point[0], point[1])
                    logger.info(f"已点击场站 '{name}',等待 {WAIT_DETAIL_PAGE_LOAD}s 加载详情页...")
                    await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)

                    # Screenshot used to verify that a detail page was opened.
                    detail_check_path = take_screenshot(d, f"tld_detail_check_{int(time.time())}.jpg")
                    logger.info(f"详情页快照已保存: {detail_check_path}")
                    try:
                        is_wrong_page = await self.check_wrong_page(d, detail_check_path)
                        if is_wrong_page:
                            logger.warning(f"检测到进入了错误页面,尝试返回列表...")
                            # Tap the back arrow usually found in the top-left
                            # corner, then also issue a system "back".
                            d.click(40, 70)
                            await asyncio.sleep(1.0)
                            d.press("back")
                            await asyncio.sleep(WAIT_BACK_TO_LIST)
                            continue
                        await self.crawl_detail_logic(d, station)
                    finally:
                        self._remove_file(detail_check_path)

                    # Mark the station as processed.
                    await self.redis_kit.set_data(redis_key, "1", expire=REDIS_STATION_EXPIRE)

                    # NOTE(review): "back" is pressed twice unconditionally,
                    # which presumably assumes the price page was entered from
                    # the detail page -- confirm the behaviour when the price
                    # entrance was not found.
                    d.press("back")
                    await asyncio.sleep(WAIT_BACK_TO_LIST)
                    d.press("back")
                    await asyncio.sleep(WAIT_BACK_TO_LIST)
                    processed_count += 1

                    if FIRST_RUN_ONLY_ONE_STATION:
                        logger.info("已完成首个场站的全流程采集,根据配置退出爬取任务。")
                        await self._wait_pending_price_tasks()
                        return

                # Scroll to the next page of the list.
                d.swipe_ext("up", scale=SCROLL_DISTANCE_RATIO)
                await asyncio.sleep(WAIT_AFTER_SCROLL)
            finally:
                # Covers break / continue / return too, so the list screenshot
                # is no longer leaked on the "no stations" paths.
                self._remove_file(screenshot_path)

        await self._wait_pending_price_tasks()

    async def check_wrong_page(self, d, image_path):
        """Return True when the screenshot shows a page other than the expected one.

        Args:
            d: uiautomator2 device handle (unused; kept for interface stability).
            image_path: Screenshot passed to ``check_wrong_page_vlm``.

        Returns:
            ``not data["is_detail_page"]``; the key defaults to True, so a
            missing key or a malformed VLM answer yields False.
        """
        data = await self.read_image_kit.check_wrong_page_vlm(image_path)
        if not isinstance(data, dict):
            # Same outcome as the "key missing" default below, without crashing.
            logger.warning(f"[页面校验] VLM 返回结果格式异常,按默认视为正常页面: {data!r}")
            return False
        is_detail = data.get("is_detail_page", True)
        if not is_detail:
            logger.warning(f"⚠️ 确认进入错误页面: {data.get('page_type')} ({data.get('reason')})")
        return not is_detail

    async def _find_price_entrance(self, d, w, h, max_scroll_attempts, scroll_step_ratio):
        """Scroll in small steps until OCR finds the price entrance.

        Returns:
            ``{"screen": path, "point": point}`` for the matching screenshot
            (which is kept for the caller), or None when nothing was found.
            Screenshots of failed attempts are deleted.
        """
        logger.info(f"[详情页] 开始“小步快跑”滚动查找价格入口 (全部时段/全天价格统一) (最多 {max_scroll_attempts} 次)...")
        for attempt in range(max_scroll_attempts):
            curr_screen = take_screenshot(d, f"tld_scroll_ocr_{attempt}_{int(time.time())}.jpg")
            keep_screen = False
            try:
                entry_data = await self.read_image_kit.find_price_entrance_ocr(curr_screen)
                if isinstance(entry_data, dict) and entry_data.get("found") and entry_data.get("point"):
                    logger.info(f"[详情页] 第 {attempt+1} 次尝试:成功探测到价格入口!")
                    keep_screen = True
                    return {"screen": curr_screen, "point": entry_data["point"]}
                # Not found: scroll a little further, except after the last attempt.
                if attempt < max_scroll_attempts - 1:
                    logger.info(f"[详情页] 第 {attempt+1} 次尝试未找到,小幅向上滚动 (步长: {scroll_step_ratio*100}%)...")
                    d.swipe(w // 2, int(h * 0.7), w // 2, int(h * (0.7 - scroll_step_ratio)), duration=0.5)
                    await asyncio.sleep(1.5)
            finally:
                if not keep_screen:
                    self._remove_file(curr_screen)
        return None

    async def _scroll_price_list_to_top(self, d):
        """Swipe down until two consecutive screenshots hash identically."""
        logger.info("正在向上滚动价格列表到顶部 (快速多次滚动以尽快看到 00:00)...")
        max_scroll_up_to_top = 10
        for i in range(max_scroll_up_to_top):
            before_scroll_path = take_screenshot(d, f"scroll_up_{i}.jpg")
            before_scroll_md5 = get_image_content_md5(before_scroll_path)
            d.swipe_ext("down", scale=0.85)
            await asyncio.sleep(0.5)
            after_scroll_path = take_screenshot(d, f"scroll_up_after_{i}.jpg")
            after_scroll_md5 = get_image_content_md5(after_scroll_path)
            self._remove_file(before_scroll_path)
            self._remove_file(after_scroll_path)
            if before_scroll_md5 == after_scroll_md5:
                logger.info(f"价格列表已到达顶部 (滚动次数: {i})")
                break

    async def _capture_price_pages(self, d):
        """Screenshot the price list page by page, from the top down.

        Returns:
            Paths of the captured pages (at most 8); the caller owns them.
        """
        logger.info("正在从顶部开始向下逐页截图...")
        price_screenshots = []
        max_scroll_down_pages = 8
        for p_idx in range(1, max_scroll_down_pages + 1):
            p_shot = take_screenshot(d, f"tld_detail_price_{int(time.time())}_{p_idx}.jpg")
            before_dn_md5 = get_image_content_md5(p_shot)
            d.swipe_ext("up", scale=0.8)
            await asyncio.sleep(1.2)
            # Probe screenshot: only used to detect whether the scroll moved.
            check_dn_path = take_screenshot(d, f"check_dn_{p_idx}.jpg")
            after_dn_md5 = get_image_content_md5(check_dn_path)
            self._remove_file(check_dn_path)
            price_screenshots.append(p_shot)
            if before_dn_md5 == after_dn_md5:
                logger.info(f"价格列表已到达底部 (共抓取页数: {p_idx})")
                break
        return price_screenshots

    async def crawl_detail_logic(self, d, station_info):
        """Extract basic info and price screenshots from a station detail page.

        Basic info is read from the first screen; values from ``station_info``
        are the fallback.  The price entrance is then located by OCR, tapped,
        and the price list is captured page by page.  Price analysis and
        saving run in a background task appended to ``pending_price_tasks``.
        If no price task was submitted, the basic profile is saved directly.

        Args:
            d: uiautomator2 device handle.
            station_info: Station dict from the list page (``name``,
                ``address``, ``distance`` and optionally pile counts are read).
        """
        first_screen_path = take_screenshot(d, f"tld_detail_basic_{int(time.time())}.jpg")
        station_name = station_info.get("name")
        address = station_info.get("address")
        distance = station_info.get("distance")
        total_piles = None
        free_piles = None
        piles_detail = None
        parking_info = None
        submitted_price_task = False
        logger.info(f"[详情页] 进入 crawl_detail_logic场站: {station_name} | 地址: {address}")
        logger.info(f"[详情页] 已截取首屏截图,准备识别基础信息: {first_screen_path}")
        try:
            basic_info = await self.read_image_kit.analyze_detail_basic_info(first_screen_path)
            if isinstance(basic_info, dict):
                # Detail-page values win over list-page values when present.
                if basic_info.get("name"):
                    station_name = basic_info.get("name")
                if basic_info.get("address"):
                    address = basic_info.get("address")
                total_piles = basic_info.get("total_piles")
                free_piles = basic_info.get("free_piles")
                piles_detail = basic_info.get("piles_detail")
                parking_info = basic_info.get("parking_info")
            # Fall back to the pile counts recognised on the list page.
            if total_piles is None and isinstance(station_info, dict):
                total_piles = station_info.get("total_piles")
            if free_piles is None and isinstance(station_info, dict):
                free_piles = station_info.get("free_piles")
            logger.info(f"[详情页] 基础信息识别结果: {station_name} | {address} | 桩数: {total_piles}/{free_piles} | 停车费: {parking_info}")
        except Exception as ex:
            logger.error(f"[详情页] 同步分析详情页基础信息失败: {ex}")
        finally:
            self._remove_file(first_screen_path)

        w, h = d.window_size()
        logger.info("[详情页] 根据用户要求:进入页面后多等会,然后通过 OCR 实时探测价格入口 (全部时段/全天价格统一)")
        # Extra wait so the page is fully loaded before probing.
        logger.info(f"[详情页] 等待 {WAIT_DETAIL_PAGE_LOAD}s 确保页面稳定...")
        await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)

        # Small-step scrolling with OCR on each step, to avoid overshooting.
        max_scroll_attempts = 6   # maximum number of scroll attempts
        scroll_step_ratio = 0.3   # step size: 30% of the screen height
        found_entry = await self._find_price_entrance(
            d, w, h, max_scroll_attempts, scroll_step_ratio
        )

        entrance_clicked = False
        try:
            if found_entry:
                price_tab_screen = found_entry["screen"]
                p = found_entry["point"]
                # The OCR point is on a 0-1000 grid; scale it to pixels.
                entry_x = int(p[0] * w / 1000)
                entry_y = int(p[1] * h / 1000)
                if entry_y > h * 0.9:
                    # Only a warning: no coordinate adjustment is applied here.
                    logger.warning(f"[详情页] 入口坐标偏低 ({entry_y}),可能在底部遮罩,尝试微调。")
                # Draw a diagnostic image marking the final tap position.
                debug_click_path = price_tab_screen.replace(".jpg", "_final_click.jpg")
                img = read_image(price_tab_screen)
                if img is not None:
                    cv2.circle(img, (entry_x, entry_y), 25, (0, 255, 0), -1)
                    save_image(debug_click_path, img)
                    logger.info(f"[详情页] 已生成最终点击诊断图: {debug_click_path}")
                logger.info(f"[详情页] 正在点击电价入口: ({entry_x}, {entry_y})")
                d.click(entry_x, entry_y)
                entrance_clicked = True
                await asyncio.sleep(WAIT_DETAIL_PAGE_LOAD)
            else:
                # Entrance text not found: log, keep a failure screenshot,
                # and continue with the basic data only.
                fail_screen = take_screenshot(d, f"tld_ocr_fail_{int(time.time())}.jpg")
                logger.error(f"❌ [OCR失败] 经过 {max_scroll_attempts} 次滚动仍未在页面中找到价格入口文字 (全部时段/全天价格统一)")
                logger.error(f"❌ [OCR失败] 最终截图已保存至: {fail_screen}")
                logger.error("❌ [OCR失败] 将不进入电价页,继续以当前信息写入基础数据。")
        except Exception as e:
            logger.error(f"[详情页] 识别或点击价格入口失败: {e}")
        finally:
            # The matching screenshot was previously leaked; the annotated
            # "_final_click" copy is what remains for diagnostics.
            if found_entry:
                self._remove_file(found_entry["screen"])

        if entrance_clicked:
            # NOTE(review): this snapshot is never deleted -- presumably it is
            # kept on purpose for diagnostics; confirm.
            entered_price_path = take_screenshot(d, f"tld_detail_price_after_enter_{int(time.time())}.jpg")
            logger.info(f"[电价页] 入口点击后的电价页截图已保存: {entered_price_path}")
            await asyncio.sleep(1.0)

            await self._scroll_price_list_to_top(d)
            price_screenshots = await self._capture_price_pages(d)

            if price_screenshots:
                # The background task takes ownership of the screenshots.
                task = asyncio.create_task(
                    self._analyze_and_save_prices_async(
                        station_name=station_name,
                        address=address,
                        distance=distance,
                        price_screenshots=price_screenshots,
                        total_piles=total_piles,
                        free_piles=free_piles,
                        piles_detail=piles_detail,
                        parking_info=parking_info
                    )
                )
                self.pending_price_tasks.append(task)
                submitted_price_task = True
                logger.info(f"[详情页] 已后台提交 {len(price_screenshots)} 张电价截图进行识别与保存,继续后续流程不阻塞。")

        if not submitted_price_task:
            await self._save_basic_profile(
                station_name=station_name,
                address=address,
                distance=distance,
                total_piles=total_piles,
                free_piles=free_piles,
                piles_detail=piles_detail,
                parking_info=parking_info,
            )

    async def crawl_list(self):
        """Connect to the device and run the list crawl (BaseCrawler hook)."""
        d = u2.connect()
        await self.crawl_list_logic(d)

    async def crawl_detail(self, station_info):
        """BaseCrawler hook; intentionally a no-op.

        Detail crawling is driven from ``crawl_list_logic`` through
        ``crawl_detail_logic``.
        """
        pass

    async def _analyze_and_save_prices_async(self, station_name, address, distance, price_screenshots, total_piles=None, free_piles=None, piles_detail=None, parking_info=None):
        """Recognise prices from screenshots and persist the station data.

        Screenshots are analysed concurrently (bounded by
        ``self.vlm_concurrency``), merged, de-duplicated by (start, end) and
        sorted by start time before being saved.  If no usable price is
        recognised, the basic profile is still saved so the station is not
        lost.  All screenshots are deleted at the end, whatever happens.

        Args:
            station_name, address, distance: Station identity fields.
            price_screenshots: Paths of the price-page screenshots.
            total_piles, free_piles, piles_detail, parking_info: Optional
                basic info forwarded unchanged to the service.
        """
        sem = asyncio.Semaphore(self.vlm_concurrency)

        async def analyze_one(path):
            # A failure on one page must not abort the other pages.
            try:
                async with sem:
                    return await self.read_image_kit.analyze_detail_price_info(path)
            except Exception as e:
                logger.error(f"[详情页] 异步识别价格失败 ({os.path.basename(path)}): {e}")
                return []

        try:
            results = await asyncio.gather(*(analyze_one(p) for p in price_screenshots))

            unique_prices = []
            seen_periods = set()
            for prices in results:
                if not isinstance(prices, (list, tuple)):
                    continue
                for item in prices:
                    if not isinstance(item, dict):
                        continue
                    period = (item.get("start"), item.get("end"))
                    if period in seen_periods:
                        continue
                    seen_periods.add(period)
                    unique_prices.append(item)

            if not unique_prices:
                logger.warning(f"[详情页] {station_name} 后台识别未提取到任何价格信息")
                # Previously this returned without saving anything, silently
                # dropping the basic info already collected for the station.
                await self._save_basic_profile(
                    station_name=station_name,
                    address=address,
                    distance=distance,
                    total_piles=total_piles,
                    free_piles=free_piles,
                    piles_detail=piles_detail,
                    parking_info=parking_info,
                )
                return

            # "or" (not a .get default) so an explicit None start cannot make
            # the sort raise TypeError.
            unique_prices.sort(key=lambda x: str(x.get("start") or "00:00"))
            await self.service.save_station_data(
                station_name=station_name,
                address=address,
                prices=unique_prices,
                total_piles=total_piles,
                free_piles=free_piles,
                piles_detail=piles_detail,
                parking_info=parking_info,
                distance=distance,
            )
            logger.info(f"[详情页] {station_name} 后台价格信息处理完成,共 {len(unique_prices)} 条时段,并已写入数据库。")
        except Exception as e:
            logger.error(f"[详情页] 后台处理价格截图失败: {e}")
        finally:
            for p_shot in price_screenshots:
                self._remove_file(p_shot)
async def main(service=None):
    """Create a TeLaiDianCrawler (optionally with an injected service) and run it."""
    await TeLaiDianCrawler(service=service).start()


# Script entry point: run the crawler on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())