'commit'

2026-01-25 17:08:40 +08:00
parent 8292bf83d1
commit 19803f96a8
7 changed files with 185 additions and 202 deletions
--- a/WeiXin/Screenshots/t6_debug_result.jpg
+++ b/WeiXin/Screenshots/t6_debug_result.jpg
--- a/WeiXin/Screenshots/t6_menu_shot_cancel.jpg
+++ b/WeiXin/Screenshots/t6_menu_shot_cancel.jpg
--- a/WeiXin/Screenshots/t6_menu_shot_convert.jpg
+++ b/WeiXin/Screenshots/t6_menu_shot_convert.jpg
--- a/WeiXin/Screenshots/t6_monitor_temp.jpg
+++ b/WeiXin/Screenshots/t6_monitor_temp.jpg
--- a/WeiXin/Screenshots/t6_ocr_shot.jpg
+++ b/WeiXin/Screenshots/t6_ocr_shot.jpg
--- a/WeiXin/T6_AutoChatMonitor.py
+++ b/WeiXin/T6_AutoChatMonitor.py
@@ -1,12 +1,13 @@
 # coding=utf-8
-import uiautomator2 as u2
-import time
-import logging
-import sys
-import os
 import asyncio
+import logging
+import os
+import sys
+import time
 from datetime import datetime

+import uiautomator2 as u2
+
 # 添加项目根目录到 sys.path
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if project_root not in sys.path:
@@ -14,7 +15,7 @@ if project_root not in sys.path:

 from Util import Win32Patch

-from WeiXin.WxUtil import find_input_box_center, perform_input_action, get_vlm_analysis, clean_screenshots_dir, is_in_chat_interface, get_vlm_json, find_template_match, find_all_template_matches
+from WeiXin.WxUtil import perform_input_action, clean_screenshots_dir, is_in_chat_interface, find_template_match, find_all_template_matches
 from Util.LlmUtil import get_llm_response
 from Util.EasyOcrKit import EasyOcrKit

@@ -60,8 +61,7 @@ class ChatBot:
        if not os.path.exists(self.screenshot_dir):
            os.makedirs(self.screenshot_dir)
            
-        # 强制使用 CPU 模式以避免 0xC0000409 (Stack Buffer Overrun) 崩溃
-        self.ocr_kit = EasyOcrKit(gpu=False)
+        self.ocr_kit = EasyOcrKit(gpu=True)
        
        self.is_first_run = True # 首次运行标志

@@ -122,14 +122,27 @@ class ChatBot:
            
            convert_template = r"d:\dsWork\aiData\WeiXin\Templates\zhun_wen_zi.jpg"
            logger.info(f"🔍 寻找模板: {convert_template}")
-            convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.7)
+            convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.6)
            
            if not convert_btn:
-                logger.warning("❌ CV 未找到 '转文字' 按钮，取消操作。")
-                self.d.click(vx + 200, vy) # 点击空白处关闭菜单
-                return None
+                logger.warning("❌ CV 未找到 '转文字' 按钮，尝试小范围 OCR 兜底...")
+                # 尝试在该区域进行 OCR 识别，寻找 "转文字" 三个字
+                ocr_results_menu = self.ocr_kit.read_text(menu_shot_path)
+                for bbox, text, conf in ocr_results_menu:
+                    if "转文字" in text or "转文" in text or "文字" in text:
+                        cx = (bbox[0][0] + bbox[2][0]) / 2
+                        cy = (bbox[0][1] + bbox[2][1]) / 2
+                        convert_btn = (cx, cy)
+                        logger.info(f"✅ OCR 兜底找到 '转文字' 按钮: {convert_btn}")
+                        break
                
-            logger.info(f"✅ CV 找到 '转文字' 按钮: {convert_btn}")
+                if not convert_btn:
+                    logger.warning("❌ CV 和 OCR 均未找到 '转文字' 按钮，取消操作。")
+                    # 点击屏幕中心区域的空白处关闭菜单，避免点到顶部返回键
+                    self.d.click(500, 500) 
+                    return None
+                
+            logger.info(f"✅ 最终找到 '转文字' 按钮坐标: {convert_btn}")
            self.d.click(convert_btn[0], convert_btn[1])
            
            # 3. 动态等待转换
@@ -221,14 +234,14 @@ class ChatBot:
        self.d.screenshot(menu_shot_path_cancel)
        
        cancel_template = r"d:\dsWork\aiData\WeiXin\Templates\cancel_zhuan_wen_zi.jpg"
-        cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.7)
+        cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.6)
        
        if cancel_btn:
            logger.info(f"✅ CV 找到 '取消转文字' 按钮: {cancel_btn}")
            self.d.click(cancel_btn[0], cancel_btn[1])
        else:
-            logger.warning("❌ CV 未找到 '取消转文字' 按钮，尝试点击空白处关闭菜单。")
-            self.d.click(vx + 200, vy)
+            logger.warning("❌ CV 未找到 '取消转文字' 按钮，点击中心区域关闭菜单。")
+            self.d.click(500, 500)
            
        return full_text

@@ -242,6 +255,12 @@ class ChatBot:
        
        while True:
            try:
+                # 0.5 检查是否在聊天界面
+                if not is_in_chat_interface(self.d):
+                    logger.warning("📵 当前不在聊天界面，跳过扫描...")
+                    await asyncio.sleep(CHECK_INTERVAL)
+                    continue
+
                logger.info("🔍 正在扫描当前界面内容...")
                
                # 1. 截图
@@ -261,202 +280,156 @@ class ChatBot:
                    
                last_screen_md5 = current_md5
                
-                # 2. VLM 分析
-                logger.info("🧠 正在调用 VLM 分析图片...")
-                result_data = await get_vlm_analysis(tmp_shot)
+                # 2. 本地视觉分析 (替代 VLM)
+                logger.info("👁️ 正在进行本地视觉扫描...")
                
-                if not result_data:
-                    logger.warning("⚠️ VLM 分析返回为空，跳过本次循环。")
-                    await asyncio.sleep(CHECK_INTERVAL)
-                    continue
-
-                # 3. 解析数据构建 dialogue_log
-                messages = result_data.get("messages", [])
-                
-                # 🚨 关键修正：按 Y 坐标对消息进行排序，确保时间顺序正确
-                # VLM 返回的顺序可能不准，必须强制按屏幕位置（从上到下）排序
-                messages.sort(key=lambda m: (m.get("center") or m.get("coordinates") or [0, 0])[1])
-                
-                input_center = result_data.get("input_box")
-
-                # --- 🔴 红点补救策略 (Red Point Correction) ---
-                # VLM 有时会漏掉红点，我们使用 CV 模板匹配来修正
+                # A. 寻找语音图标 (audio.jpg) 和 红点 (red_point.jpg)
+                audio_template = r"d:\dsWork\aiData\WeiXin\Templates\audio.jpg"
                red_point_template = r"d:\dsWork\aiData\WeiXin\Templates\red_point.jpg"
+                
+                audio_matches = find_all_template_matches(tmp_shot, audio_template, threshold=0.8)
                red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8)
                
-                if red_points:
-                    logger.info(f"🔴 CV 检测到 {len(red_points)} 个红点，正在修正语音消息状态...")
-                    for rp in red_points:
-                        rx, ry = rp
-                        # 遍历所有消息，找到距离该红点最近的【语音消息】
-                        # 规则：红点通常在语音消息的右侧，Y轴差异不大
-                        best_match_msg = None
-                        min_dist = 9999
-                        
-                        for msg in messages:
-                            if msg.get("type") == "voice":
-                                coords = msg.get("center") or msg.get("coordinates")
-                                if coords:
-                                    mx, my = coords
-                                    # 检查 Y 轴距离 (红点应该和语音气泡在同一行，容差 50px)
-                                    if abs(my - ry) < 50:
-                                        # 检查 X 轴关系 (红点在语音气泡右侧)
-                                        if rx > mx: 
-                                            dist = ((rx - mx)**2 + (ry - my)**2)**0.5
-                                            if dist < min_dist:
-                                                min_dist = dist
-                                                best_match_msg = msg
-                        
-                        if best_match_msg:
-                            # 只有当距离合理（比如小于 300px，视气泡长度而定，但红点一般紧挨着）
-                            # 考虑到长语音气泡可能很长，中心点在中间，红点在最右边，距离可能较远
-                            # 所以主要依赖 Y 轴对齐和 X 轴方向。
-                            # 这里直接标记
-                            if not best_match_msg.get("is_unread", False):
-                                best_match_msg["is_unread"] = True
-                                logger.info(f"🔴 修正：标记语音消息 {best_match_msg.get('content')} 为未读 (红点坐标: {rp})")
-                # ---------------------------------------------
-
+                # B. 本地 OCR 识别全文以构建上下文
+                ocr_results = self.ocr_kit.read_text(tmp_shot)
+                # 按 Y 坐标排序
+                ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
                
-                # --- Debug Visualization ---
-                try:
-                    import cv2
-                    import numpy as np
-                    debug_img = cv2.imread(tmp_shot)
-                    if debug_img is not None:
-                        # Draw input box
-                        if input_center:
-                            ix, iy = input_center
-                            cv2.circle(debug_img, (int(ix), int(iy)), 10, (0, 0, 255), -1) # Red dot
-                            cv2.putText(debug_img, "Input", (int(ix), int(iy)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
-
-                        for msg in messages:
-                            m_type = msg.get("type", "text")
-                            coords = msg.get("center") or msg.get("coordinates")
-                            
-                            if coords:
-                                cx, cy = int(coords[0]), int(coords[1])
-                                if m_type == "voice":
-                                    # Green box for voice
-                                    # We don't have w/h, so just draw a fixed size box or circle
-                                    # Let's draw a rectangle around the center
-                                    cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (0, 255, 0), 3)
-                                    cv2.circle(debug_img, (cx, cy), 5, (0, 255, 0), -1)
-                                    cv2.putText(debug_img, "Voice", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
-                                # else:
-                                #     # Blue box for text
-                                #     cv2.rectangle(debug_img, (cx-50, cy-25), (cx+50, cy+25), (255, 0, 0), 2)
-                                #     cv2.putText(debug_img, "Text", (cx, cy-30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
-
-                        debug_path = os.path.join(self.screenshot_dir, "t6_debug_result.jpg")
-                        cv2.imwrite(debug_path, debug_img)
-                        logger.info(f"🐛 Debug 标记图已保存: {debug_path}")
-                except Exception as e:
-                    logger.error(f"Debug drawing failed: {e}")
-                # ---------------------------
-
                dialogue_log = []
-                voice_messages = [] # 存储所有语音消息
+                voice_messages = []
                
-                for i, msg in enumerate(messages):
-                    # 简单的发送者判断
-                    sender_val = msg.get("sender", "对方")
-                    if sender_val in ["Me", "我"]:
-                        sender_name = "我"
-                    else:
-                        sender_name = "对方"
-                        
-                    msg_type = msg.get("type", "text")
-                    content = msg.get("content", "")
-                    # status = msg.get("status", "unconverted") # 不再依赖 status
-                    is_unread = msg.get("is_unread", False)
+                # 准备可视化调试图
+                import cv2
+                import numpy as np
+                debug_img = cv2.imread(tmp_shot)
+
+                # 记录已匹配到语音图标的 OCR 块索引
+                matched_ocr_indices = set()
+
+                # 先处理语音图标匹配
+                for ax, ay in audio_matches:
+                    # 排除顶部标题栏(0-300)和底部输入区(1800+)
+                    if ay < 300 or ay > 1800: 
+                        logger.info(f"⏭️ 忽略区域外语音图标: ({ax}, {ay})")
+                        continue
                    
-                    if msg_type == "voice":
-                        coords = msg.get("center") or msg.get("coordinates")
-                        if coords:
-                            msg["coordinates"] = coords
-                            # 只处理“对方”的语音消息，忽略“我”发送的语音
-                            if sender_name != "我":
-                                voice_messages.append(msg)
-                            
-                        # 在日志中暂时标记为 [语音]，稍后如果处理了会更新
-                        # 但为了日志完整性，我们这里先占位
-                        # 实际上，我们需要知道这个语音的内容才能放入 context
-                        # 如果没有内容，只能放 [语音]
-                        # 只有被处理过的语音，我们才能获取内容。
-                        # 对于历史语音，如果我们不处理（非首次运行且无红点），我们无法知道内容。
-                        # 所以这里只能 append 占位符。
-                        dialogue_log.append(f"{sender_name}: [语音] {content}")
+                    sender = "对方" if ax < 500 else "我"
+                    logger.info(f"🎙️ 发现语音图标: x={ax}, y={ay}, 发送者={sender}")
+                    is_unread = False
+                    if red_points:
+                        for rx, ry in red_points:
+                            # 红点通常在语音图标右侧，且 Y 轴相近
+                            if abs(ry - ay) < 50 and rx > ax:
+                                is_unread = True
+                                # 绘制红点
+                                cv2.circle(debug_img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
+                                break
+                    
+                    # 寻找附近的时长文字 (OCR)
+                    duration_text = "语音"
+                    for idx, (bbox, text, conf) in enumerate(ocr_results):
+                        c_x = (bbox[0][0] + bbox[2][0]) / 2
+                        c_y = (bbox[0][1] + bbox[2][1]) / 2
+                        if abs(c_y - ay) < 40 and abs(c_x - ax) < 300:
+                            if '"' in text or text.isdigit():
+                                duration_text = text
+                                matched_ocr_indices.add(idx)
+                                break
+                    
+                    # 计算点击坐标：直接点击语音图标中心
+                    click_x, click_y = ax, ay
+                    
+                    # 绘制视觉反馈
+                    # 语音图标用绿框
+                    cv2.rectangle(debug_img, (int(ax-30), int(ay-30)), (int(ax+30), int(ay+30)), (0, 255, 0), 3)
+                    # 点击位置用红十字
+                    cv2.drawMarker(debug_img, (int(click_x), int(click_y)), (0, 0, 255), cv2.MARKER_CROSS, 35, 3)
+
+                    v_msg = {
+                        "type": "voice",
+                        "content": duration_text,
+                        "coordinates": [click_x, click_y],
+                        "sender": sender,
+                        "is_unread": is_unread
+                    }
+                    if sender == "对方":
+                        voice_messages.append(v_msg)
+                    dialogue_log.append({
+                        "y": ay,
+                        "text": f"{sender}: [语音] {duration_text}",
+                        "is_voice": True,
+                        "id": f"voice_{ax}_{ay}"
+                    })
+
+                # 处理剩余的 OCR 文字块 (普通文本)
+                for idx, (bbox, text, conf) in enumerate(ocr_results):
+                    if idx in matched_ocr_indices: continue
+                    
+                    x_min, x_max = bbox[0][0], bbox[2][0]
+                    y_min, y_max = bbox[0][1], bbox[2][1]
+                    c_x, c_y = (x_min + x_max) / 2, (y_min + y_max) / 2
+                    
+                    if c_y < 300 or c_y > 1800: continue
+                    
+                    if x_min < 250 and x_max < 700:
+                        sender, color = "对方", (0, 255, 0)
+                    elif x_max > 800 and x_min > 300:
+                        sender, color = "我", (255, 0, 0)
                    else:
-                        dialogue_log.append(f"{sender_name}: {content}")
-                
-                logger.info(f"📑 界面扫描完成，当前对话历史共 {len(dialogue_log)} 条")
+                        sender, color = "系统", (128, 128, 128)
+                    
+                    if sender != "系统":
+                        logger.info(f"💬 发现文本消息: x={c_x}, y={c_y}, 发送者={sender}, 内容={text}")
+                        cv2.rectangle(debug_img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, 1)
+                        dialogue_log.append({
+                            "y": c_y,
+                            "text": f"{sender}: {text}",
+                            "is_voice": False
+                        })
+
+                # 按 Y 轴重新排序整个对话日志
+                dialogue_log.sort(key=lambda x: x['y'])
+
+                # 保存调试图
+                debug_shot_path = os.path.join(self.screenshot_dir, "t6_debug_view.jpg")
+                cv2.imwrite(debug_shot_path, debug_img)
+                logger.info(f"🎨 已保存视觉调试图: {debug_shot_path}")
+
+                # C. 寻找输入框 (CV 模板匹配)
+                input_template = r"d:\dsWork\aiData\WeiXin\Templates\input_box.jpg" # 假设有这个模板
+                input_center = find_template_match(tmp_shot, input_template, threshold=0.6)
+                if not input_center:
+                    # 几何兜底：屏幕底部 88% 处
+                    from PIL import Image
+                    with Image.open(tmp_shot) as img:
+                        w, h = img.size
+                        input_center = [w // 2, int(h * 0.88)]
+                        logger.info(f"📍 使用几何兜底输入框坐标: {input_center}")

                # 4. 语音处理逻辑
                processed_voice_content = None
                input_y = input_center[1] if input_center else None
                
-                if self.is_first_run:
-                    logger.info("🌟 首次运行：处理屏幕上所有语音消息...")
-                    for v_msg in voice_messages:
-                        # 查找下一条消息，用于限定 OCR 范围
-                        try:
-                            idx = messages.index(v_msg)
-                            next_msg = messages[idx + 1] if idx + 1 < len(messages) else None
-                        except ValueError:
-                            next_msg = None
-
-                        # 无论是否未读，都处理
-                        text = await self.process_single_voice(v_msg, next_msg, input_y)
+                # 只有未读的才处理
+                for v_msg in voice_messages:
+                    if v_msg.get("is_unread") or self.is_first_run:
+                        logger.info(f"🔴 发现未读/待处理语音: {v_msg['content']}")
+                        # 找到 OCR 结果中的下一条作为边界
+                        idx = -1
+                        # 这里简化逻辑，直接处理
+                        text = await self.process_single_voice(v_msg, None, input_y)
                        if text:
-                            # 直接更新 dialogue_log 对应的条目
-                            dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
-                            
-                            if v_msg == voice_messages[-1]:
-                                processed_voice_content = text
-                    
-                    self.is_first_run = False # 标记首次运行结束
-
-                    # 初始化 last_processed_msg，避免回复历史消息
-                    if dialogue_log:
-                        last_log = dialogue_log[-1]
-                        if last_log.startswith("对方"):
-                             content = last_log.split(":", 1)[1].strip()
-                             self.last_processed_msg = content
-                             logger.info(f"🌟 首次运行，标记最后一条对方消息为已处理: {content}")
-                    
-                else:
-                    # 后续监控：只处理最后一条，且必须是未读 (is_unread=True)
-                    if voice_messages:
-                        last_voice = voice_messages[-1]
-                        if last_voice.get("is_unread", False):
-                            logger.info("🔴 发现未读语音消息 (最后一条)，正在处理...")
-                            
-                            # 查找下一条消息
-                            try:
-                                idx = messages.index(last_voice)
-                                next_msg = messages[idx + 1] if idx + 1 < len(messages) else None
-                            except ValueError:
-                                next_msg = None
-                                
-                            text = await self.process_single_voice(last_voice, next_msg, input_y)
-                            if text:
-                                # 直接更新 dialogue_log 对应的条目
-                                dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
-                                processed_voice_content = text
-                        else:
-                            # 增加更多调试信息，帮助定位为何跳过
-                            sender = last_voice.get("sender", "未知")
-                            content = last_voice.get("content", "")
-                            coords = last_voice.get("coordinates", [])
-                            logger.info(f"⚪ 最后一条语音消息已读，跳过处理。[{sender}, {content}, {coords}]")
-
-
-                # 5. LLM 回复逻辑
-                # 只有当有新的语音被处理并识别出文字，或者有新的文本消息时才回复
-                # 既然 dialogue_log 已经更新，我们直接用 history_text
+                            # 更新 log 中的内容
+                            for item in dialogue_log:
+                                if item.get("is_voice") and f"[语音] {v_msg['content']}" in item["text"]:
+                                    item["text"] = item["text"].replace("[语音]", f"[语音转文字: {text}]")
+                                    break
                
-                history_text = "\n".join(dialogue_log)
+                self.is_first_run = False
+                
+                # 5. LLM 回复逻辑
+                final_dialogue_texts = [item['text'] for item in dialogue_log]
+                history_text = "\n".join(final_dialogue_texts)
                
                # 判断是否需要回复：
                # 核心规则：只有当最后一条消息是“对方”说的，且内容未处理过，才回复。
@@ -465,7 +438,8 @@ class ChatBot:
                current_last_content = ""

                if dialogue_log:
-                    last_log = dialogue_log[-1]
+                    last_item = dialogue_log[-1]
+                    last_log = last_item["text"]
                    
                    # 检查最后一条消息的发送者
                    if last_log.startswith("对方"):
@@ -493,11 +467,20 @@ class ChatBot:
                    self.last_processed_msg = current_last_content
                    
                    reply = await self.get_reply(history_text)
-                    logger.info(f"💡 LLM 回复: {reply}")
-                    
-                    if reply and input_center:
-                         # 输入并发送
-                         perform_input_action(self.d, input_center, reply)
+                    if reply:
+                        logger.info(f"💡 LLM 回复: {reply}")
+                        
+                        if input_center:
+                             # 输入并发送
+                             perform_input_action(self.d, input_center, reply)
+                             # 发送后，为了防止下一轮 OCR 识别到自己的回复片段并误判为对方消息
+                             # 我们把 last_processed_msg 设置为一个特殊的占位符，直到下一次真正识别到对方的新消息
+                             # 或者更简单：在下一轮循环开始前稍微多等一下，让消息气泡完全显示
+                             time.sleep(1)
+                             # 将最后处理的消息内容标记为已处理，防止 LLM 回复逻辑在下一轮立即触发
+                             # 注意：这里的 current_last_content 是对方的最后一条
+                    else:
+                        logger.warning("⚠️ LLM 未生成有效回复。")
                
                # 休眠
                await asyncio.sleep(CHECK_INTERVAL)
--- a/WeiXin/Templates/audio.jpg
+++ b/WeiXin/Templates/audio.jpg