diff --git a/WeiXin/Screenshots/t6_debug_result.jpg b/WeiXin/Screenshots/t6_debug_result.jpg deleted file mode 100644 index b702a33..0000000 Binary files a/WeiXin/Screenshots/t6_debug_result.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t6_menu_shot_cancel.jpg b/WeiXin/Screenshots/t6_menu_shot_cancel.jpg deleted file mode 100644 index 6194cb0..0000000 Binary files a/WeiXin/Screenshots/t6_menu_shot_cancel.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t6_menu_shot_convert.jpg b/WeiXin/Screenshots/t6_menu_shot_convert.jpg deleted file mode 100644 index 6194cb0..0000000 Binary files a/WeiXin/Screenshots/t6_menu_shot_convert.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t6_monitor_temp.jpg b/WeiXin/Screenshots/t6_monitor_temp.jpg deleted file mode 100644 index b4634ca..0000000 Binary files a/WeiXin/Screenshots/t6_monitor_temp.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t6_ocr_shot.jpg b/WeiXin/Screenshots/t6_ocr_shot.jpg deleted file mode 100644 index 8ba240e..0000000 Binary files a/WeiXin/Screenshots/t6_ocr_shot.jpg and /dev/null differ diff --git a/WeiXin/T6_AutoChatMonitor.py b/WeiXin/T6_AutoChatMonitor.py index 8e0a2a1..9cc2c56 100644 --- a/WeiXin/T6_AutoChatMonitor.py +++ b/WeiXin/T6_AutoChatMonitor.py @@ -1,12 +1,13 @@ # coding=utf-8 -import uiautomator2 as u2 -import time -import logging -import sys -import os import asyncio +import logging +import os +import sys +import time from datetime import datetime +import uiautomator2 as u2 + # 添加项目根目录到 sys.path project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if project_root not in sys.path: @@ -14,7 +15,7 @@ if project_root not in sys.path: from Util import Win32Patch -from WeiXin.WxUtil import find_input_box_center, perform_input_action, get_vlm_analysis, clean_screenshots_dir, is_in_chat_interface, get_vlm_json, find_template_match, find_all_template_matches +from WeiXin.WxUtil import perform_input_action, clean_screenshots_dir, is_in_chat_interface, find_template_match, find_all_template_matches from Util.LlmUtil import get_llm_response from Util.EasyOcrKit import EasyOcrKit @@ -60,8 +61,7 @@ class ChatBot: if not os.path.exists(self.screenshot_dir): os.makedirs(self.screenshot_dir) - # 强制使用 CPU 模式以避免 0xC0000409 (Stack Buffer Overrun) 崩溃 - self.ocr_kit = EasyOcrKit(gpu=False) + self.ocr_kit = EasyOcrKit(gpu=True) self.is_first_run = True # 首次运行标志 @@ -122,14 +122,27 @@ class ChatBot: convert_template = r"d:\dsWork\aiData\WeiXin\Templates\zhun_wen_zi.jpg" logger.info(f"🔍 寻找模板: {convert_template}") - convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.7) + convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.6) if not convert_btn: - logger.warning("❌ CV 未找到 '转文字' 按钮,取消操作。") - self.d.click(vx + 200, vy) # 点击空白处关闭菜单 - return None + logger.warning("❌ CV 未找到 '转文字' 按钮,尝试小范围 OCR 兜底...") + # 尝试在该区域进行 OCR 识别,寻找 "转文字" 三个字 + ocr_results_menu = self.ocr_kit.read_text(menu_shot_path) + for bbox, text, conf in ocr_results_menu: + if "转文字" in text or "转文" in text or "文字" in text: + cx = (bbox[0][0] + bbox[2][0]) / 2 + cy = (bbox[0][1] + bbox[2][1]) / 2 + convert_btn = (cx, cy) + logger.info(f"✅ OCR 兜底找到 '转文字' 按钮: {convert_btn}") + break - logger.info(f"✅ CV 找到 '转文字' 按钮: {convert_btn}") + if not convert_btn: + logger.warning("❌ CV 和 OCR 均未找到 '转文字' 按钮,取消操作。") + # 点击屏幕中心区域的空白处关闭菜单,避免点到顶部返回键 + self.d.click(500, 500) + return None + + logger.info(f"✅ 最终找到 '转文字' 按钮坐标: {convert_btn}") self.d.click(convert_btn[0], convert_btn[1]) # 3. 动态等待转换 @@ -221,14 +234,14 @@ class ChatBot: self.d.screenshot(menu_shot_path_cancel) cancel_template = r"d:\dsWork\aiData\WeiXin\Templates\cancel_zhuan_wen_zi.jpg" - cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.7) + cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.6) if cancel_btn: logger.info(f"✅ CV 找到 '取消转文字' 按钮: {cancel_btn}") self.d.click(cancel_btn[0], cancel_btn[1]) else: - logger.warning("❌ CV 未找到 '取消转文字' 按钮,尝试点击空白处关闭菜单。") - self.d.click(vx + 200, vy) + logger.warning("❌ CV 未找到 '取消转文字' 按钮,点击中心区域关闭菜单。") + self.d.click(500, 500) return full_text @@ -242,6 +255,12 @@ class ChatBot: while True: try: + # 0.5 检查是否在聊天界面 + if not is_in_chat_interface(self.d): + logger.warning("📵 当前不在聊天界面,跳过扫描...") + await asyncio.sleep(CHECK_INTERVAL) + continue + logger.info("🔍 正在扫描当前界面内容...") # 1. 截图 @@ -261,202 +280,156 @@ class ChatBot: last_screen_md5 = current_md5 - # 2. VLM 分析 - logger.info("🧠 正在调用 VLM 分析图片...") - result_data = await get_vlm_analysis(tmp_shot) + # 2. 本地视觉分析 (替代 VLM) + logger.info("�️ 正在进行本地视觉扫描...") - if not result_data: - logger.warning("⚠️ VLM 分析返回为空,跳过本次循环。") - await asyncio.sleep(CHECK_INTERVAL) - continue - - # 3. 解析数据构建 dialogue_log - messages = result_data.get("messages", []) - - # 🚨 关键修正:按 Y 坐标对消息进行排序,确保时间顺序正确 - # VLM 返回的顺序可能不准,必须强制按屏幕位置(从上到下)排序 - messages.sort(key=lambda m: (m.get("center") or m.get("coordinates") or [0, 0])[1]) - - input_center = result_data.get("input_box") - - # --- 🔴 红点补救策略 (Red Point Correction) --- - # VLM 有时会漏掉红点,我们使用 CV 模板匹配来修正 + # A. 寻找语音图标 (audio.jpg) 和 红点 (red_point.jpg) + audio_template = r"d:\dsWork\aiData\WeiXin\Templates\audio.jpg" red_point_template = r"d:\dsWork\aiData\WeiXin\Templates\red_point.jpg" + + audio_matches = find_all_template_matches(tmp_shot, audio_template, threshold=0.8) red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8) - if red_points: - logger.info(f"🔴 CV 检测到 {len(red_points)} 个红点,正在修正语音消息状态...") - for rp in red_points: - rx, ry = rp - # 遍历所有消息,找到距离该红点最近的【语音消息】 - # 规则:红点通常在语音消息的右侧,Y轴差异不大 - best_match_msg = None - min_dist = 9999 - - for msg in messages: - if msg.get("type") == "voice": - coords = msg.get("center") or msg.get("coordinates") - if coords: - mx, my = coords - # 检查 Y 轴距离 (红点应该和语音气泡在同一行,容差 50px) - if abs(my - ry) < 50: - # 检查 X 轴关系 (红点在语音气泡右侧) - if rx > mx: - dist = ((rx - mx)**2 + (ry - my)**2)**0.5 - if dist < min_dist: - min_dist = dist - best_match_msg = msg - - if best_match_msg: - # 只有当距离合理(比如小于 300px,视气泡长度而定,但红点一般紧挨着) - # 考虑到长语音气泡可能很长,中心点在中间,红点在最右边,距离可能较远 - # 所以主要依赖 Y 轴对齐和 X 轴方向。 - # 这里直接标记 - if not best_match_msg.get("is_unread", False): - best_match_msg["is_unread"] = True - logger.info(f"🔴 修正:标记语音消息 {best_match_msg.get('content')} 为未读 (红点坐标: {rp})") - # --------------------------------------------- - + # B. 本地 OCR 识别全文以构建上下文 + ocr_results = self.ocr_kit.read_text(tmp_shot) + # 按 Y 坐标排序 + ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2) - # --- Debug Visualization --- - try: - import cv2 - import numpy as np - debug_img = cv2.imread(tmp_shot) - if debug_img is not None: - # Draw input box - if input_center: - ix, iy = input_center - cv2.circle(debug_img, (int(ix), int(iy)), 10, (0, 0, 255), -1) # Red dot - cv2.putText(debug_img, "Input", (int(ix), int(iy)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2) - - for msg in messages: - m_type = msg.get("type", "text") - coords = msg.get("center") or msg.get("coordinates") - - if coords: - cx, cy = int(coords[0]), int(coords[1]) - if m_type == "voice": - # Green box for voice - # We don't have w/h, so just draw a fixed size box or circle - # Let's draw a rectangle around the center - cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (0, 255, 0), 3) - cv2.circle(debug_img, (cx, cy), 5, (0, 255, 0), -1) - cv2.putText(debug_img, "Voice", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) - # else: - # # Blue box for text - # cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (255, 0, 0), 2) - # cv2.putText(debug_img, "Text", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2) - - debug_path = os.path.join(self.screenshot_dir, "t6_debug_result.jpg") - cv2.imwrite(debug_path, debug_img) - logger.info(f"🐛 Debug 标记图已保存: {debug_path}") - except Exception as e: - logger.error(f"Debug drawing failed: {e}") - # --------------------------- - dialogue_log = [] - voice_messages = [] # 存储所有语音消息 + voice_messages = [] - for i, msg in enumerate(messages): - # 简单的发送者判断 - sender_val = msg.get("sender", "对方") - if sender_val in ["Me", "我"]: - sender_name = "我" - else: - sender_name = "对方" - - msg_type = msg.get("type", "text") - content = msg.get("content", "") - # status = msg.get("status", "unconverted") # 不再依赖 status - is_unread = msg.get("is_unread", False) + # 准备可视化调试图 + import cv2 + import numpy as np + debug_img = cv2.imread(tmp_shot) + + # 记录已匹配到语音图标的 OCR 块索引 + matched_ocr_indices = set() + + # 先处理语音图标匹配 + for ax, ay in audio_matches: + # 排除顶部标题栏(0-300)和底部输入区(1800+) + if ay < 300 or ay > 1800: + logger.info(f"⏭️ 忽略区域外语音图标: ({ax}, {ay})") + continue - if msg_type == "voice": - coords = msg.get("center") or msg.get("coordinates") - if coords: - msg["coordinates"] = coords - # 只处理“对方”的语音消息,忽略“我”发送的语音 - if sender_name != "我": - voice_messages.append(msg) - - # 在日志中暂时标记为 [语音],稍后如果处理了会更新 - # 但为了日志完整性,我们这里先占位 - # 实际上,我们需要知道这个语音的内容才能放入 context - # 如果没有内容,只能放 [语音] - # 只有被处理过的语音,我们才能获取内容。 - # 对于历史语音,如果我们不处理(非首次运行且无红点),我们无法知道内容。 - # 所以这里只能 append 占位符。 - dialogue_log.append(f"{sender_name}: [语音] {content}") + sender = "对方" if ax < 500 else "我" + logger.info(f"🎙️ 发现语音图标: x={ax}, y={ay}, 发送者={sender}") + is_unread = False + if red_points: + for rx, ry in red_points: + # 红点通常在语音图标右侧,且 Y 轴相近 + if abs(ry - ay) < 50 and rx > ax: + is_unread = True + # 绘制红点 + cv2.circle(debug_img, (int(rx), int(ry)), 12, (0, 0, 255), -1) + break + + # 寻找附近的时长文字 (OCR) + duration_text = "语音" + for idx, (bbox, text, conf) in enumerate(ocr_results): + c_x = (bbox[0][0] + bbox[2][0]) / 2 + c_y = (bbox[0][1] + bbox[2][1]) / 2 + if abs(c_y - ay) < 40 and abs(c_x - ax) < 300: + if '"' in text or text.isdigit(): + duration_text = text + matched_ocr_indices.add(idx) + break + + # 计算点击坐标:直接点击语音图标中心 + click_x, click_y = ax, ay + + # 绘制视觉反馈 + # 语音图标用绿框 + cv2.rectangle(debug_img, (int(ax-30), int(ay-30)), (int(ax+30), int(ay+30)), (0, 255, 0), 3) + # 点击位置用红十字 + cv2.drawMarker(debug_img, (int(click_x), int(click_y)), (0, 0, 255), cv2.MARKER_CROSS, 35, 3) + + v_msg = { + "type": "voice", + "content": duration_text, + "coordinates": [click_x, click_y], + "sender": sender, + "is_unread": is_unread + } + if sender == "对方": + voice_messages.append(v_msg) + dialogue_log.append({ + "y": ay, + "text": f"{sender}: [语音] {duration_text}", + "is_voice": True, + "id": f"voice_{ax}_{ay}" + }) + + # 处理剩余的 OCR 文字块 (普通文本) + for idx, (bbox, text, conf) in enumerate(ocr_results): + if idx in matched_ocr_indices: continue + + x_min, x_max = bbox[0][0], bbox[2][0] + y_min, y_max = bbox[0][1], bbox[2][1] + c_x, c_y = (x_min + x_max) / 2, (y_min + y_max) / 2 + + if c_y < 300 or c_y > 1800: continue + + if x_min < 250 and x_max < 700: + sender, color = "对方", (0, 255, 0) + elif x_max > 800 and x_min > 300: + sender, color = "我", (255, 0, 0) else: - dialogue_log.append(f"{sender_name}: {content}") - - logger.info(f"📑 界面扫描完成,当前对话历史共 {len(dialogue_log)} 条") + sender, color = "系统", (128, 128, 128) + + if sender != "系统": + logger.info(f"💬 发现文本消息: x={c_x}, y={c_y}, 发送者={sender}, 内容={text}") + cv2.rectangle(debug_img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, 1) + dialogue_log.append({ + "y": c_y, + "text": f"{sender}: {text}", + "is_voice": False + }) + + # 按 Y 轴重新排序整个对话日志 + dialogue_log.sort(key=lambda x: x['y']) + + # 保存调试图 + debug_shot_path = os.path.join(self.screenshot_dir, "t6_debug_view.jpg") + cv2.imwrite(debug_shot_path, debug_img) + logger.info(f"🎨 已保存视觉调试图: {debug_shot_path}") + + # C. 寻找输入框 (CV 模板匹配) + input_template = r"d:\dsWork\aiData\WeiXin\Templates\input_box.jpg" # 假设有这个模板 + input_center = find_template_match(tmp_shot, input_template, threshold=0.6) + if not input_center: + # 几何兜底:屏幕底部 88% 处 + from PIL import Image + with Image.open(tmp_shot) as img: + w, h = img.size + input_center = [w // 2, int(h * 0.88)] + logger.info(f"� 使用几何兜底输入框坐标: {input_center}") # 4. 语音处理逻辑 processed_voice_content = None input_y = input_center[1] if input_center else None - if self.is_first_run: - logger.info("🌟 首次运行:处理屏幕上所有语音消息...") - for v_msg in voice_messages: - # 查找下一条消息,用于限定 OCR 范围 - try: - idx = messages.index(v_msg) - next_msg = messages[idx + 1] if idx + 1 < len(messages) else None - except ValueError: - next_msg = None - - # 无论是否未读,都处理 - text = await self.process_single_voice(v_msg, next_msg, input_y) + # 只有未读的才处理 + for v_msg in voice_messages: + if v_msg.get("is_unread") or self.is_first_run: + logger.info(f"🔴 发现未读/待处理语音: {v_msg['content']}") + # 找到 OCR 结果中的下一条作为边界 + idx = -1 + # 这里简化逻辑,直接处理 + text = await self.process_single_voice(v_msg, None, input_y) if text: - # 直接更新 dialogue_log 对应的条目 - dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]") - - if v_msg == voice_messages[-1]: - processed_voice_content = text - - self.is_first_run = False # 标记首次运行结束 - - # 初始化 last_processed_msg,避免回复历史消息 - if dialogue_log: - last_log = dialogue_log[-1] - if last_log.startswith("对方"): - content = last_log.split(":", 1)[1].strip() - self.last_processed_msg = content - logger.info(f"🌟 首次运行,标记最后一条对方消息为已处理: {content}") - - else: - # 后续监控:只处理最后一条,且必须是未读 (is_unread=True) - if voice_messages: - last_voice = voice_messages[-1] - if last_voice.get("is_unread", False): - logger.info("🔴 发现未读语音消息 (最后一条),正在处理...") - - # 查找下一条消息 - try: - idx = messages.index(last_voice) - next_msg = messages[idx + 1] if idx + 1 < len(messages) else None - except ValueError: - next_msg = None - - text = await self.process_single_voice(last_voice, next_msg, input_y) - if text: - # 直接更新 dialogue_log 对应的条目 - dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]") - processed_voice_content = text - else: - # 增加更多调试信息,帮助定位为何跳过 - sender = last_voice.get("sender", "未知") - content = last_voice.get("content", "") - coords = last_voice.get("coordinates", []) - logger.info(f"⚪ 最后一条语音消息已读,跳过处理。[{sender}, {content}, {coords}]") - - - # 5. LLM 回复逻辑 - # 只有当有新的语音被处理并识别出文字,或者有新的文本消息时才回复 - # 既然 dialogue_log 已经更新,我们直接用 history_text + # 更新 log 中的内容 + for item in dialogue_log: + if item.get("is_voice") and f"[语音] {v_msg['content']}" in item["text"]: + item["text"] = item["text"].replace("[语音]", f"[语音转文字: {text}]") + break - history_text = "\n".join(dialogue_log) + self.is_first_run = False + + # 5. LLM 回复逻辑 + final_dialogue_texts = [item['text'] for item in dialogue_log] + history_text = "\n".join(final_dialogue_texts) # 判断是否需要回复: # 核心规则:只有当最后一条消息是“对方”说的,且内容未处理过,才回复。 @@ -465,7 +438,8 @@ class ChatBot: current_last_content = "" if dialogue_log: - last_log = dialogue_log[-1] + last_item = dialogue_log[-1] + last_log = last_item["text"] # 检查最后一条消息的发送者 if last_log.startswith("对方"): @@ -493,11 +467,20 @@ class ChatBot: self.last_processed_msg = current_last_content reply = await self.get_reply(history_text) - logger.info(f"💡 LLM 回复: {reply}") - - if reply and input_center: - # 输入并发送 - perform_input_action(self.d, input_center, reply) + if reply: + logger.info(f"💡 LLM 回复: {reply}") + + if input_center: + # 输入并发送 + perform_input_action(self.d, input_center, reply) + # 发送后,为了防止下一轮 OCR 识别到自己的回复片段并误判为对方消息 + # 我们把 last_processed_msg 设置为一个特殊的占位符,直到下一次真正识别到对方的新消息 + # 或者更简单:在下一轮循环开始前稍微多等一下,让消息气泡完全显示 + time.sleep(1) + # 将最后处理的消息内容标记为已处理,防止 LLM 回复逻辑在下一轮立即触发 + # 注意:这里的 current_last_content 是对方的最后一条 + else: + logger.warning("⚠️ LLM 未生成有效回复。") # 休眠 await asyncio.sleep(CHECK_INTERVAL) diff --git a/WeiXin/Templates/audio.jpg b/WeiXin/Templates/audio.jpg new file mode 100644 index 0000000..e3a2b40 Binary files /dev/null and b/WeiXin/Templates/audio.jpg differ