This commit is contained in:
HuangHai
2026-01-25 17:08:40 +08:00
parent 8292bf83d1
commit 19803f96a8
7 changed files with 185 additions and 202 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 137 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 137 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 137 KiB

View File

@@ -1,12 +1,13 @@
# coding=utf-8
import uiautomator2 as u2
import time
import logging
import sys
import os
import asyncio
import logging
import os
import sys
import time
from datetime import datetime
import uiautomator2 as u2
# 添加项目根目录到 sys.path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
@@ -14,7 +15,7 @@ if project_root not in sys.path:
from Util import Win32Patch
from WeiXin.WxUtil import find_input_box_center, perform_input_action, get_vlm_analysis, clean_screenshots_dir, is_in_chat_interface, get_vlm_json, find_template_match, find_all_template_matches
from WeiXin.WxUtil import perform_input_action, clean_screenshots_dir, is_in_chat_interface, find_template_match, find_all_template_matches
from Util.LlmUtil import get_llm_response
from Util.EasyOcrKit import EasyOcrKit
@@ -60,8 +61,7 @@ class ChatBot:
if not os.path.exists(self.screenshot_dir):
os.makedirs(self.screenshot_dir)
# 强制使用 CPU 模式以避免 0xC0000409 (Stack Buffer Overrun) 崩溃
self.ocr_kit = EasyOcrKit(gpu=False)
self.ocr_kit = EasyOcrKit(gpu=True)
self.is_first_run = True # 首次运行标志
@@ -122,14 +122,27 @@ class ChatBot:
convert_template = r"d:\dsWork\aiData\WeiXin\Templates\zhun_wen_zi.jpg"
logger.info(f"🔍 寻找模板: {convert_template}")
convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.7)
convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.6)
if not convert_btn:
logger.warning("❌ CV 未找到 '转文字' 按钮,取消操作。")
self.d.click(vx + 200, vy) # 点击空白处关闭菜单
return None
logger.warning("❌ CV 未找到 '转文字' 按钮,尝试小范围 OCR 兜底...")
# 尝试在该区域进行 OCR 识别,寻找 "转文字" 三个字
ocr_results_menu = self.ocr_kit.read_text(menu_shot_path)
for bbox, text, conf in ocr_results_menu:
if "转文字" in text or "转文" in text or "文字" in text:
cx = (bbox[0][0] + bbox[2][0]) / 2
cy = (bbox[0][1] + bbox[2][1]) / 2
convert_btn = (cx, cy)
logger.info(f"✅ OCR 兜底找到 '转文字' 按钮: {convert_btn}")
break
logger.info(f"✅ CV 找到 '转文字' 按钮: {convert_btn}")
if not convert_btn:
logger.warning("❌ CV 和 OCR 均未找到 '转文字' 按钮,取消操作。")
# 点击屏幕中心区域的空白处关闭菜单,避免点到顶部返回键
self.d.click(500, 500)
return None
logger.info(f"✅ 最终找到 '转文字' 按钮坐标: {convert_btn}")
self.d.click(convert_btn[0], convert_btn[1])
# 3. 动态等待转换
@@ -221,14 +234,14 @@ class ChatBot:
self.d.screenshot(menu_shot_path_cancel)
cancel_template = r"d:\dsWork\aiData\WeiXin\Templates\cancel_zhuan_wen_zi.jpg"
cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.7)
cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.6)
if cancel_btn:
logger.info(f"✅ CV 找到 '取消转文字' 按钮: {cancel_btn}")
self.d.click(cancel_btn[0], cancel_btn[1])
else:
logger.warning("❌ CV 未找到 '取消转文字' 按钮,尝试点击空白处关闭菜单。")
self.d.click(vx + 200, vy)
logger.warning("❌ CV 未找到 '取消转文字' 按钮,点击中心区域关闭菜单。")
self.d.click(500, 500)
return full_text
@@ -242,6 +255,12 @@ class ChatBot:
while True:
try:
# 0.5 检查是否在聊天界面
if not is_in_chat_interface(self.d):
logger.warning("📵 当前不在聊天界面,跳过扫描...")
await asyncio.sleep(CHECK_INTERVAL)
continue
logger.info("🔍 正在扫描当前界面内容...")
# 1. 截图
@@ -261,202 +280,156 @@ class ChatBot:
last_screen_md5 = current_md5
# 2. VLM 分析
logger.info("🧠 正在调用 VLM 分析图片...")
result_data = await get_vlm_analysis(tmp_shot)
# 2. 本地视觉分析 (替代 VLM)
logger.info("<EFBFBD> 正在进行本地视觉扫描...")
if not result_data:
logger.warning("⚠️ VLM 分析返回为空,跳过本次循环。")
await asyncio.sleep(CHECK_INTERVAL)
continue
# 3. 解析数据构建 dialogue_log
messages = result_data.get("messages", [])
# 🚨 关键修正:按 Y 坐标对消息进行排序,确保时间顺序正确
# VLM 返回的顺序可能不准,必须强制按屏幕位置(从上到下)排序
messages.sort(key=lambda m: (m.get("center") or m.get("coordinates") or [0, 0])[1])
input_center = result_data.get("input_box")
# --- 🔴 红点补救策略 (Red Point Correction) ---
# VLM 有时会漏掉红点,我们使用 CV 模板匹配来修正
# A. 寻找语音图标 (audio.jpg) 和 红点 (red_point.jpg)
audio_template = r"d:\dsWork\aiData\WeiXin\Templates\audio.jpg"
red_point_template = r"d:\dsWork\aiData\WeiXin\Templates\red_point.jpg"
audio_matches = find_all_template_matches(tmp_shot, audio_template, threshold=0.8)
red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8)
if red_points:
logger.info(f"🔴 CV 检测到 {len(red_points)} 个红点,正在修正语音消息状态...")
for rp in red_points:
rx, ry = rp
# 遍历所有消息,找到距离该红点最近的【语音消息】
# 规则:红点通常在语音消息的右侧,Y 轴差异不大
best_match_msg = None
min_dist = 9999
for msg in messages:
if msg.get("type") == "voice":
coords = msg.get("center") or msg.get("coordinates")
if coords:
mx, my = coords
# 检查 Y 轴距离 (红点应该和语音气泡在同一行,容差 50px)
if abs(my - ry) < 50:
# 检查 X 轴关系 (红点在语音气泡右侧)
if rx > mx:
dist = ((rx - mx)**2 + (ry - my)**2)**0.5
if dist < min_dist:
min_dist = dist
best_match_msg = msg
if best_match_msg:
# 只有当距离合理(比如小于 300px,视气泡长度而定,但红点一般紧挨着)
# 考虑到长语音气泡可能很长,中心点在中间,红点在最右边,距离可能较远
# 所以主要依赖 Y 轴对齐和 X 轴方向。
# 这里直接标记
if not best_match_msg.get("is_unread", False):
best_match_msg["is_unread"] = True
logger.info(f"🔴 修正:标记语音消息 {best_match_msg.get('content')} 为未读 (红点坐标: {rp})")
# ---------------------------------------------
# B. 本地 OCR 识别全文以构建上下文
ocr_results = self.ocr_kit.read_text(tmp_shot)
# 按 Y 坐标排序
ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
# --- Debug Visualization ---
try:
import cv2
import numpy as np
debug_img = cv2.imread(tmp_shot)
if debug_img is not None:
# Draw input box
if input_center:
ix, iy = input_center
cv2.circle(debug_img, (int(ix), int(iy)), 10, (0, 0, 255), -1) # Red dot
cv2.putText(debug_img, "Input", (int(ix), int(iy)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
for msg in messages:
m_type = msg.get("type", "text")
coords = msg.get("center") or msg.get("coordinates")
if coords:
cx, cy = int(coords[0]), int(coords[1])
if m_type == "voice":
# Green box for voice
# We don't have w/h, so just draw a fixed size box or circle
# Let's draw a rectangle around the center
cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (0, 255, 0), 3)
cv2.circle(debug_img, (cx, cy), 5, (0, 255, 0), -1)
cv2.putText(debug_img, "Voice", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
# else:
# # Blue box for text
# cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (255, 0, 0), 2)
# cv2.putText(debug_img, "Text", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
debug_path = os.path.join(self.screenshot_dir, "t6_debug_result.jpg")
cv2.imwrite(debug_path, debug_img)
logger.info(f"🐛 Debug 标记图已保存: {debug_path}")
except Exception as e:
logger.error(f"Debug drawing failed: {e}")
# ---------------------------
dialogue_log = []
voice_messages = [] # 存储所有语音消息
voice_messages = []
for i, msg in enumerate(messages):
# 简单的发送者判断
sender_val = msg.get("sender", "对方")
if sender_val in ["Me", ""]:
sender_name = ""
else:
sender_name = "对方"
msg_type = msg.get("type", "text")
content = msg.get("content", "")
# status = msg.get("status", "unconverted") # 不再依赖 status
is_unread = msg.get("is_unread", False)
# 准备可视化调试图
import cv2
import numpy as np
debug_img = cv2.imread(tmp_shot)
# 记录已匹配到语音图标的 OCR 块索引
matched_ocr_indices = set()
# 先处理语音图标匹配
for ax, ay in audio_matches:
# 排除顶部标题栏(0-300)和底部输入区(1800+)
if ay < 300 or ay > 1800:
logger.info(f"⏭️ 忽略区域外语音图标: ({ax}, {ay})")
continue
if msg_type == "voice":
coords = msg.get("center") or msg.get("coordinates")
if coords:
msg["coordinates"] = coords
# 只处理“对方”的语音消息,忽略“我”发送的语音
if sender_name != "":
voice_messages.append(msg)
# 在日志中暂时标记为 [语音],稍后如果处理了会更新
# 但为了日志完整性,我们这里先占位
# 实际上,我们需要知道这个语音的内容才能放入 context
# 如果没有内容,只能放 [语音]
# 只有被处理过的语音,我们才能获取内容。
# 对于历史语音,如果我们不处理(非首次运行且无红点),我们无法知道内容。
# 所以这里只能 append 占位符。
dialogue_log.append(f"{sender_name}: [语音] {content}")
sender = "对方" if ax < 500 else ""
logger.info(f"🎙️ 发现语音图标: x={ax}, y={ay}, 发送者={sender}")
is_unread = False
if red_points:
for rx, ry in red_points:
# 红点通常在语音图标右侧,且 Y 轴相近
if abs(ry - ay) < 50 and rx > ax:
is_unread = True
# 绘制红点
cv2.circle(debug_img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
break
# 寻找附近的时长文字 (OCR)
duration_text = "语音"
for idx, (bbox, text, conf) in enumerate(ocr_results):
c_x = (bbox[0][0] + bbox[2][0]) / 2
c_y = (bbox[0][1] + bbox[2][1]) / 2
if abs(c_y - ay) < 40 and abs(c_x - ax) < 300:
if '"' in text or text.isdigit():
duration_text = text
matched_ocr_indices.add(idx)
break
# 计算点击坐标:直接点击语音图标中心
click_x, click_y = ax, ay
# 绘制视觉反馈
# 语音图标用绿框
cv2.rectangle(debug_img, (int(ax-30), int(ay-30)), (int(ax+30), int(ay+30)), (0, 255, 0), 3)
# 点击位置用红十字
cv2.drawMarker(debug_img, (int(click_x), int(click_y)), (0, 0, 255), cv2.MARKER_CROSS, 35, 3)
v_msg = {
"type": "voice",
"content": duration_text,
"coordinates": [click_x, click_y],
"sender": sender,
"is_unread": is_unread
}
if sender == "对方":
voice_messages.append(v_msg)
dialogue_log.append({
"y": ay,
"text": f"{sender}: [语音] {duration_text}",
"is_voice": True,
"id": f"voice_{ax}_{ay}"
})
# 处理剩余的 OCR 文字块 (普通文本)
for idx, (bbox, text, conf) in enumerate(ocr_results):
if idx in matched_ocr_indices: continue
x_min, x_max = bbox[0][0], bbox[2][0]
y_min, y_max = bbox[0][1], bbox[2][1]
c_x, c_y = (x_min + x_max) / 2, (y_min + y_max) / 2
if c_y < 300 or c_y > 1800: continue
if x_min < 250 and x_max < 700:
sender, color = "对方", (0, 255, 0)
elif x_max > 800 and x_min > 300:
sender, color = "", (255, 0, 0)
else:
dialogue_log.append(f"{sender_name}: {content}")
logger.info(f"📑 界面扫描完成,当前对话历史共 {len(dialogue_log)}")
sender, color = "系统", (128, 128, 128)
if sender != "系统":
logger.info(f"💬 发现文本消息: x={c_x}, y={c_y}, 发送者={sender}, 内容={text}")
cv2.rectangle(debug_img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, 1)
dialogue_log.append({
"y": c_y,
"text": f"{sender}: {text}",
"is_voice": False
})
# 按 Y 轴重新排序整个对话日志
dialogue_log.sort(key=lambda x: x['y'])
# 保存调试图
debug_shot_path = os.path.join(self.screenshot_dir, "t6_debug_view.jpg")
cv2.imwrite(debug_shot_path, debug_img)
logger.info(f"🎨 已保存视觉调试图: {debug_shot_path}")
# C. 寻找输入框 (CV 模板匹配)
input_template = r"d:\dsWork\aiData\WeiXin\Templates\input_box.jpg" # 假设有这个模板
input_center = find_template_match(tmp_shot, input_template, threshold=0.6)
if not input_center:
# 几何兜底:屏幕底部 88% 处
from PIL import Image
with Image.open(tmp_shot) as img:
w, h = img.size
input_center = [w // 2, int(h * 0.88)]
logger.info(f"<EFBFBD> 使用几何兜底输入框坐标: {input_center}")
# 4. 语音处理逻辑
processed_voice_content = None
input_y = input_center[1] if input_center else None
if self.is_first_run:
logger.info("🌟 首次运行:处理屏幕上所有语音消息...")
for v_msg in voice_messages:
# 查找下一条消息,用于限定 OCR 范围
try:
idx = messages.index(v_msg)
next_msg = messages[idx + 1] if idx + 1 < len(messages) else None
except ValueError:
next_msg = None
# 无论是否未读,都处理
text = await self.process_single_voice(v_msg, next_msg, input_y)
# 只有未读的才处理
for v_msg in voice_messages:
if v_msg.get("is_unread") or self.is_first_run:
logger.info(f"🔴 发现未读/待处理语音: {v_msg['content']}")
# 找到 OCR 结果中的下一条作为边界
idx = -1
# 这里简化逻辑,直接处理
text = await self.process_single_voice(v_msg, None, input_y)
if text:
# 直接更新 dialogue_log 对应的条目
dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
if v_msg == voice_messages[-1]:
processed_voice_content = text
self.is_first_run = False # 标记首次运行结束
# 初始化 last_processed_msg,避免回复历史消息
if dialogue_log:
last_log = dialogue_log[-1]
if last_log.startswith("对方"):
content = last_log.split(":", 1)[1].strip()
self.last_processed_msg = content
logger.info(f"🌟 首次运行,标记最后一条对方消息为已处理: {content}")
else:
# 后续监控:只处理最后一条,且必须是未读 (is_unread=True)
if voice_messages:
last_voice = voice_messages[-1]
if last_voice.get("is_unread", False):
logger.info("🔴 发现未读语音消息 (最后一条),正在处理...")
# 查找下一条消息
try:
idx = messages.index(last_voice)
next_msg = messages[idx + 1] if idx + 1 < len(messages) else None
except ValueError:
next_msg = None
text = await self.process_single_voice(last_voice, next_msg, input_y)
if text:
# 直接更新 dialogue_log 对应的条目
dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
processed_voice_content = text
else:
# 增加更多调试信息,帮助定位为何跳过
sender = last_voice.get("sender", "未知")
content = last_voice.get("content", "")
coords = last_voice.get("coordinates", [])
logger.info(f"⚪ 最后一条语音消息已读,跳过处理。[{sender}, {content}, {coords}]")
# 5. LLM 回复逻辑
# 只有当有新的语音被处理并识别出文字,或者有新的文本消息时才回复
# 既然 dialogue_log 已经更新,我们直接用 history_text
# 更新 log 中的内容
for item in dialogue_log:
if item.get("is_voice") and f"[语音] {v_msg['content']}" in item["text"]:
item["text"] = item["text"].replace("[语音]", f"[语音转文字: {text}]")
break
history_text = "\n".join(dialogue_log)
self.is_first_run = False
# 5. LLM 回复逻辑
final_dialogue_texts = [item['text'] for item in dialogue_log]
history_text = "\n".join(final_dialogue_texts)
# 判断是否需要回复:
# 核心规则:只有当最后一条消息是“对方”说的,且内容未处理过,才回复。
@@ -465,7 +438,8 @@ class ChatBot:
current_last_content = ""
if dialogue_log:
last_log = dialogue_log[-1]
last_item = dialogue_log[-1]
last_log = last_item["text"]
# 检查最后一条消息的发送者
if last_log.startswith("对方"):
@@ -493,11 +467,20 @@ class ChatBot:
self.last_processed_msg = current_last_content
reply = await self.get_reply(history_text)
logger.info(f"💡 LLM 回复: {reply}")
if reply and input_center:
# 输入并发送
perform_input_action(self.d, input_center, reply)
if reply:
logger.info(f"💡 LLM 回复: {reply}")
if input_center:
# 输入并发送
perform_input_action(self.d, input_center, reply)
# 发送后,为了防止下一轮 OCR 识别到自己的回复片段并误判为对方消息
# 我们把 last_processed_msg 设置为一个特殊的占位符,直到下一次真正识别到对方的新消息
# 或者更简单:在下一轮循环开始前稍微多等一下,让消息气泡完全显示
time.sleep(1)
# 将最后处理的消息内容标记为已处理,防止 LLM 回复逻辑在下一轮立即触发
# 注意:这里的 current_last_content 是对方的最后一条
else:
logger.warning("⚠️ LLM 未生成有效回复。")
# 休眠
await asyncio.sleep(CHECK_INTERVAL)

BIN
WeiXin/Templates/audio.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB