'commit'

2026-01-26 10:50:11 +08:00
parent 4868198143
commit a662c33ecf
6 changed files with 263 additions and 924 deletions
--- a/WeiXin/T2_GetHistory.py
+++ b/WeiXin/T2_GetHistory.py
@@ -1,75 +0,0 @@
-# coding=utf-8
-import time
-import logging
-import sys
-import os
-import asyncio
-
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if project_root not in sys.path:
-    sys.path.append(project_root)
-
-from WeiXin import WxUtil
-from WeiXin.WxUtil import analyze_chat_image
-
-# 配置日志
-log_dir = WxUtil.LOG_DIR
-if not os.path.exists(log_dir):
-    os.makedirs(log_dir)
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler(os.path.join(log_dir, "T2_GetHistory.log"), encoding='utf-8'),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger("T2_GetHistory")
-
-async def get_history(target_name="对方"):
-    # 运行前清理 Logs 和 Output
-    WxUtil.setup_script_environment()
-    
-    logger.info("开始执行 T2: 获取当前屏幕对话历史...")
-    
-    d = WxUtil.connect_device()
-    if not d:
-        return
-
-    # 截图
-    screenshot_dir = WxUtil.OUTPUT_DIR
-    if not os.path.exists(screenshot_dir):
-        os.makedirs(screenshot_dir)
-    
-    timestamp = time.strftime("%Y%m%d_%H%M%S")
-    filename = f"T2_history_{timestamp}.jpg"
-    save_path = os.path.join(screenshot_dir, filename)
-    
-    try:
-        d.screenshot(save_path)
-        logger.info(f"截图已保存: {save_path}")
-        
-        analyzed_filename = f"T2_history_{timestamp}_analyzed.jpg"
-        analyzed_path = os.path.join(screenshot_dir, analyzed_filename)
-        
-        # 调用 WxUtil 中的分析函数
-        dialogue_log, input_box = await analyze_chat_image(save_path, analyzed_path, device=d, target_name=target_name)
-        
-        logger.info("✅ T2 识别结果：")
-        if dialogue_log:
-            for log in dialogue_log:
-                print(log) # 打印到控制台
-        else:
-            logger.info("未提取到对话内容或当前屏幕无对话气泡。")
-
-        # 检查是否触发了转换
-        if isinstance(dialogue_log, list) and any("[正在转换语音...]" in str(msg) for msg in dialogue_log):
-            logger.info("检测到语音正在转文字，建议等待转换完成后重新运行 T2 以获取完整内容。")
-            return
-            
-    except Exception as e:
-        logger.error(f"❌ T2 执行失败: {e}")
-
-if __name__ == "__main__":
-    asyncio.run(get_history())
--- a/WeiXin/T3_InputLlmText.py
+++ b/WeiXin/T3_InputLlmText.py
@@ -1,135 +0,0 @@
-# coding=utf-8
-import time
-import logging
-import sys
-import os
-import asyncio
-
-# 添加项目根目录到 sys.path 以便导入 Util
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if project_root not in sys.path:
-    sys.path.append(project_root)
-
-from WeiXin import WxUtil
-from WeiXin.WxUtil import find_input_box_center, perform_input_action, analyze_chat_image, clean_screenshots_dir
-from Util.LlmUtil import get_llm_response
-
-# 配置日志
-log_dir = WxUtil.LOG_DIR
-if not os.path.exists(log_dir):
-    os.makedirs(log_dir)
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler(os.path.join(log_dir, "T3_InputLlmText.log"), encoding='utf-8'),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger("T3_InputLlmText")
-
-async def generate_and_input():
-    # 运行前清理 Logs 和 Output
-    WxUtil.setup_script_environment()
-    
-    logger.info("开始执行 T3: 生成 LLM 回复并输入...")
-    
-    try:
-        # 1. 连接设备
-        d = WxUtil.connect_device()
-        if not d:
-            return
-
-        # 2. 截图并识别对话历史
-        screenshot_dir = WxUtil.OUTPUT_DIR
-        
-        tmp_shot = os.path.join(screenshot_dir, "t4_temp_history_check.jpg")
-        analyzed_shot = os.path.join(screenshot_dir, "t4_temp_history_analyzed.jpg")
-        
-        d.screenshot(tmp_shot)
-        dialogue_log, input_box = await analyze_chat_image(tmp_shot, analyzed_shot, device=d)
-        
-        # 检查是否包含正在转换的标识
-        is_converting = any("[正在转换语音...]" in str(msg) for msg in dialogue_log) if isinstance(dialogue_log, list) else False
-        
-        if is_converting:
-            logger.info("检测到语音正在转文字，等待 3 秒后重新截图分析...")
-            await asyncio.sleep(3)
-            d.screenshot(tmp_shot)
-            dialogue_log, input_box = await analyze_chat_image(tmp_shot, analyzed_shot, device=d)
-            
-            # 无论第二次结果如何，我们都继续执行，不再跳过
-            if any("[正在转换语音...]" in str(msg) for msg in dialogue_log) if isinstance(dialogue_log, list) else False:
-                logger.warning("语音转换时间较长，将尝试根据当前已有内容生成回复。")
-
-        history_text = ""
-        if dialogue_log and isinstance(dialogue_log, list):
-            # 过滤掉系统的转换提示语，避免干扰 LLM
-            filtered_log = [msg for msg in dialogue_log if "[正在转换语音...]" not in str(msg)]
-            history_text = "\n".join(filtered_log)
-            logger.info("提取到对话历史: ")
-            for msg in filtered_log:
-                logger.info(msg)
-        elif dialogue_log:
-            history_text = str(dialogue_log)
-            logger.info(f"提取到对话历史: {history_text}")
-        else:
-            logger.warning("未提取到对话历史")
-            history_text = "（无对话历史）"
-
-        # 3. 调用 LLM 生成回复
-        persona = (
-            "你是一名1999年毕业、拥有27年一线教学经验的小学高级教师，名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。"
-            "你不仅是一位作文教学专家，批阅过超过2万篇作文，更是一位懂得孩子心理、能与家长共情的教育智者。"
-            "你的回复风格应该是：温厚、亲切、睿智且极具亲和力，就像一位老邻居、老大哥在和家长坐在长椅上聊天。"
-            "严禁使用任何列表格式（如 1. 2. 3. 或 A. B. C.），严禁使用‘首先、其次、最后’等刻板的逻辑词。"
-            "要用连贯、优美、富有生活气息的文字进行叙述。每一句话都要带温度，要善于用‘咱们孩子’、‘作为老师我也理解’等词汇拉近距离。"
-            "回复内容要有深度，不要只给结论，要讲透背后的教育逻辑。字数必须严格控制在 100-200 字之间。"
-            "如果涉及到校区信息，必须且只能使用以下真实数据，绝对严禁使用任何占位符：\n"
-            "- 单位：长春市少惠林作文素养培养中心\n"
-            "- 地址：南环城路与临河街交汇，TOUCH12街3楼325号\n"
-            "- 联系人：小张老师（电话：18686619970）\n"
-            "- 课程：线上/线下作文课、读书营/阅读策略营（假期开设）\n"
-            "- 上课：作文通常在周二/周四晚上，周六/周日全天"
-        )
-        prompt = (
-            f"【教师人设】：{persona}\n\n"
-            f"【近期聊天记录】：\n{history_text}\n\n"
-            "【任务要求】：\n"
-            "请作为大张老师，给家长写一段暖心且有深度的回复。要针对家长最后提到的问题或状态，先给予情感上的安抚和理解，"
-            "然后再结合你27年的教学经验，给出具体的建议。建议要讲得细致、感人，展现出老教师的智慧沉淀。"
-            "全文必须是一个或两个完整的自然段，绝对禁止分点列项！字数严格在 100-200 字之间。"
-            "如果需要提供联系方式或地址，请自然地揉进段落中，严禁出现'XX'占位符。直接输出回复的正文内容，不要包含任何多余的修饰词或引号。"
-        )
-        logger.info(f"正在以'亲切而专业的特级教师'身份请求 LLM 生成深度回复...")
-        
-        full_response = ""
-        async for chunk in get_llm_response(prompt, stream=False):
-            full_response += chunk
-            
-        llm_text = full_response.strip().strip('"').strip('“').strip('”')
-        logger.info(f"LLM 生成的回复内容: {llm_text}")
-        
-        if not llm_text:
-            logger.error("LLM 生成内容为空，停止执行。")
-            return
-
-        # 4. 识别输入框位置
-        center_point, rect_box = find_input_box_center(tmp_shot)
-        
-        # 5. 执行输入动作
-        # 即使 center_point 为 None，perform_input_action 也会尝试通过原生控件识别输入框
-        logger.info(f"正在准备输入回复内容...")
-        success = perform_input_action(d, center_point, llm_text, auto_send=True)
-        
-        if success:
-            logger.info("✅ T3 执行完成：文字已成功输入并点击发送。")
-        else:
-            logger.error("❌ T3 执行失败：输入动作未成功完成。")
-            
-    except Exception as e:
-        logger.error(f"❌ T3 执行出错: {e}", exc_info=True)
-
-if __name__ == "__main__":
-    asyncio.run(generate_and_input())
--- a/WeiXin/T4_CV_Voice_Debug.py
+++ b/WeiXin/T4_CV_Voice_Debug.py
@@ -30,7 +30,9 @@ logging.basicConfig(
 logger = logging.getLogger("T4_CV_Voice_Debug")


-def run_cv_debug():
+import asyncio
+
+async def run_cv_debug():
    # 运行前清理 Logs 和 Output
    WxUtil.setup_script_environment()
    
@@ -42,7 +44,6 @@ def run_cv_debug():
    
    try:
        screenshot_dir = WxUtil.OUTPUT_DIR
-            
        image_path = os.path.join(screenshot_dir, "t4_live_shot.jpg")
        output_path = os.path.join(screenshot_dir, "T4_debug_view.jpg")
        
@@ -52,67 +53,20 @@ def run_cv_debug():
        logger.error(f"❌ 拍照失败: {e}")
        return

-    logger.info(f"🔍 正在分析实时图片...")
+    logger.info(f"🔍 正在调用 WxUtil.analyze_chat_image 分析最后一条消息...")
    
-    # 模板路径
-    template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates")
-    audio_template = os.path.join(template_dir, "audio.jpg")
-    red_point_template = os.path.join(template_dir, "red_point.jpg")
+    # 2. 调用新的分析逻辑
+    dialogue_log, input_pos = await WxUtil.analyze_chat_image(image_path, output_path, device=d)
    
-    if not os.path.exists(audio_template) or not os.path.exists(red_point_template):
-        logger.error("错误: 模板文件不存在")
-        return
+    if dialogue_log:
+        logger.info("📢 识别到的最后一条消息:")
+        for line in dialogue_log:
+            logger.info(f"  {line}")
+    else:
+        logger.warning("⚠️ 未识别到任何消息")
        
-    # 2. 识别逻辑
-    audio_matches = find_all_template_matches(image_path, audio_template, threshold=0.8)
-    red_points = find_all_template_matches(image_path, red_point_template, threshold=0.8)
-    
-    logger.info(f"发现语音图标数量: {len(audio_matches)}")
-    logger.info(f"发现红点数量: {len(red_points)}")
-
-    # 3. 读取图片并绘制
-    img = cv2.imread(image_path)
-    if img is None:
-        logger.error("错误: 无法读取图片")
-        return
-
-    for ax, ay in audio_matches:
-        # 排除顶部标题栏和底部输入区 (假设 300-1800 为有效区)
-        if ay < 300 or ay > 1800:
-            continue
-            
-        sender = "对方" if ax < 500 else "我"
-        
-        # --- 1. 先判断是否未读 (寻找附近的红点) ---
-        is_unread = False
-        for rx, ry in red_points:
-            if abs(ry - ay) < 50 and rx > ax:
-                is_unread = True
-                break
-        
-        # --- 2. 根据状态选择颜色 ---
-        # BGR 格式: 红色 (0, 0, 255), 绿色 (0, 255, 0)
-        color = (0, 0, 255) if is_unread else (0, 255, 0)
-        status_text = "未读" if is_unread else "已读"
-        
-        # --- 3. 绘制标注 ---
-        # 语音图标框 (加粗)
-        cv2.rectangle(img, (int(ax-35), int(ay-35)), (int(ax+35), int(ay+35)), color, 3)
-        
-        # 中心点击位置 (实心圆)
-        cv2.circle(img, (int(ax), int(ay)), 15, color, -1)
-        
-        # 如果是未读，把原本识别到的红点也再次标出
-        if is_unread:
-            for rx, ry in red_points:
-                if abs(ry - ay) < 50 and rx > ax:
-                    cv2.circle(img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
-        
-        logger.info(f"标注语音消息: ({ax}, {ay}), 发送者: {sender}, 状态: {status_text}")
-
-    # 保存结果
-    cv2.imwrite(output_path, img)
-    logger.info(f"✅ 调试图片已保存至: {output_path}")
+    if input_pos:
+        logger.info(f"📍 识别到输入框位置: {input_pos}")

 if __name__ == "__main__":
-    run_cv_debug()
+    asyncio.run(run_cv_debug())
--- a/WeiXin/T5_AutoChatMonitor.py
+++ b/WeiXin/T5_AutoChatMonitor.py
@@ -5,6 +5,7 @@ import os
 import sys
 import time
 from datetime import datetime
+import hashlib

 # 添加项目根目录到 sys.path
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -12,11 +13,9 @@ if project_root not in sys.path:
    sys.path.append(project_root)

 from Util import Win32Patch
-
 from WeiXin import WxUtil
-from WeiXin.WxUtil import perform_input_action, clean_screenshots_dir, find_template_match, find_all_template_matches
+from WeiXin.WxUtil import perform_input_action
 from Util.LlmUtil import get_llm_response
-from Util.EasyOcrKit import EasyOcrKit

 # 配置日志
 log_dir = WxUtil.LOG_DIR
@@ -27,30 +26,23 @@ if not os.path.exists(log_dir):
 logger = logging.getLogger("T5_AutoChatMonitor")
 logger.setLevel(logging.INFO)

-# 清除现有的 handlers，防止重复打印或配置冲突
 if logger.hasHandlers():
    logger.handlers.clear()

-# 创建 FileHandler
 log_file_path = os.path.join(log_dir, "T5_AutoChatMonitor.log")
 file_handler = logging.FileHandler(log_file_path, encoding='utf-8', mode='w')
 file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 logger.addHandler(file_handler)

-# 创建 StreamHandler
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 logger.addHandler(stream_handler)

-# 防止日志传播到 root logger，避免重复输出
 logger.propagate = False
-
-# 打印日志文件位置，方便确认
 logger.info(f"日志文件路径: {log_file_path}")

 # 配置参数
 CHECK_INTERVAL = 5  # 检查频率 (秒)
-TEMPLATE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates")

 class ChatBot:
    def __init__(self):
@@ -60,14 +52,10 @@ class ChatBot:
        self.d = WxUtil.connect_device()
        if not self.d:
            raise Exception("无法连接到设备，任务终止")
-        self.last_message_text = ""
-        self.last_processed_msg_id = None # 记录上一条已处理的消息标识 (文本+坐标)
+        
+        self.last_processed_msg_hash = None # 记录最后一条已处理消息的哈希值
        self.screenshot_dir = WxUtil.OUTPUT_DIR
        
-        self.ocr_kit = EasyOcrKit(gpu=True)
-        
-        self.is_first_run = True # 首次运行标志
-
        self.persona = (
            "你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师，名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。"
            "你不仅是一位作文教学专家，更是一位心思细腻、能与家长共情的教育智者。"
@@ -85,13 +73,12 @@ class ChatBot:
            "- 每学期开学招收小学三年级至六年级，初中七年级的学生入学，其它年段不招生。\n"
        )

-    async def get_reply(self, history_text):
+    async def get_reply(self, last_message_text):
        prompt = (
            f"【教师人设】：{self.persona}\n\n"
-            f"【近期聊天记录】：\n{history_text}\n\n"
+            f"【最后一条消息】：\n{last_message_text}\n\n"
            "【任务要求】：\n"
-            "请作为大张老师回复家长。**必须且只能针对聊天记录中的最后一条消息进行回复！**\n"
-            "之前的聊天记录仅供参考上下文，如果之前的问题已经回答过，绝对不要重复回答。\n"
+            "请作为大张老师回复家长。**必须且只能针对最后一条消息进行回复！**\n"
            "严禁发散，严禁编造家长没说过的情况。如果不清楚家长的意图，就温柔询问。\n"
            "字数严格控制在 50 字以内。直接输出回复正文。"
        )
@@ -101,367 +88,63 @@ class ChatBot:
            full_response += chunk
        return full_response.strip().strip('"').strip('“').strip('”')

-    async def process_single_voice(self, voice_msg, next_msg=None, input_box_y=None):
-        """
-        处理单个语音消息的完整流程：
-        长按 -> CV找转文字 -> 点击 -> 等待 -> 截图OCR -> 长按 -> CV找取消转文字 -> 点击
-        返回: 转换后的文本内容 (如果没有转换成功，返回 None)
-        """
-        vx, vy = voice_msg['coordinates']
-        content = voice_msg.get('content', '0"')
-        logger.info(f"🎤 开始处理语音消息: {content}, 坐标: ({vx}, {vy})")
-        
-        try:
-            # 1. 长按语音消息
-            logger.info("👆 正在长按语音消息...")
-            self.d.long_click(vx, vy, 0.6)
-            logger.info("✅ 长按完成，等待菜单...")
-            time.sleep(0.3)
-            
-            # 2. CV 模板匹配寻找 "转文字" 按钮
-            menu_shot_path = os.path.join(self.screenshot_dir, "t6_menu_shot_convert.jpg")
-            logger.info(f"📸 截取菜单图: {menu_shot_path}")
-            self.d.screenshot(menu_shot_path)
-            
-            convert_template = os.path.join(TEMPLATE_DIR, "zhun_wen_zi.jpg")
-            logger.info(f"🔍 寻找模板: {convert_template}")
-            convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.6)
-            
-            if not convert_btn:
-                logger.warning("❌ CV 未找到 '转文字' 按钮，尝试小范围 OCR 兜底...")
-                # 尝试在该区域进行 OCR 识别，寻找 "转文字" 三个字
-                ocr_results_menu = self.ocr_kit.read_text(menu_shot_path)
-                for bbox, text, conf in ocr_results_menu:
-                    if "转文字" in text or "转文" in text or "文字" in text:
-                        cx = (bbox[0][0] + bbox[2][0]) / 2
-                        cy = (bbox[0][1] + bbox[2][1]) / 2
-                        convert_btn = (cx, cy)
-                        logger.info(f"✅ OCR 兜底找到 '转文字' 按钮: {convert_btn}")
-                        break
-                
-                if not convert_btn:
-                    logger.warning("❌ CV 和 OCR 均未找到 '转文字' 按钮，取消操作。")
-                    # 点击屏幕中心区域的空白处关闭菜单，避免点到顶部返回键
-                    self.d.click(500, 500) 
-                    return None
-                
-            logger.info(f"✅ 最终找到 '转文字' 按钮坐标: {convert_btn}")
-            self.d.click(convert_btn[0], convert_btn[1])
-            
-            # 3. 动态等待转换
-            duration_str = content.replace('"', '').strip()
-            try:
-                duration = int(duration_str)
-            except:
-                duration = 10
-            wait_seconds = max(2, duration / 5.0)
-            logger.info(f"⏳ 语音时长 {duration}s，等待转换 {wait_seconds:.1f}s...")
-            time.sleep(wait_seconds)
-            
-            # 4. 截图并 OCR 识别内容
-            ocr_shot_path = os.path.join(self.screenshot_dir, "t6_ocr_shot.jpg")
-            logger.info(f"📸 截取 OCR 识别图: {ocr_shot_path}")
-            self.d.screenshot(ocr_shot_path)
-            
-            # OCR 识别
-            # 策略：识别整个屏幕，但只提取位于当前语音消息下方，且在下一条消息（如果有）上方的内容
-            logger.info("📖 开始 OCR 识别...")
-            ocr_results = self.ocr_kit.read_text(ocr_shot_path)
-            logger.info(f"✅ OCR 识别完成，获取 {len(ocr_results)} 个文本块")
-        except Exception as e:
-            logger.error(f"❌ process_single_voice 发生异常: {e}", exc_info=True)
-            return None
-        
-        # 按 Y 坐标排序，确保从上往下处理
-        ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
-        
-        extracted_text = []
-        
-        # 准备下一条消息的内容片段作为停止条件
-        next_msg_snippet = None
-        if next_msg and next_msg.get("type") == "text":
-            c = next_msg.get("content", "").strip()
-            if c:
-                next_msg_snippet = c[:8] # 取前8个字符作为指纹
-        
-        for bbox, text, conf in ocr_results:
-            # bbox center y
-            c_y = (bbox[0][1] + bbox[2][1]) / 2
-            
-            # 1. 过滤掉当前语音气泡及以上的内容
-            # 语音气泡中心是 vy，底部大概在 vy + 30 左右
-            if c_y <= vy + 25:
-                continue
-                
-            # 2. 如果有输入框坐标，过滤掉输入框以下的内容
-            if input_box_y and c_y >= input_box_y - 30:
-                continue
-            
-            # 3. 如果遇到下一条消息的内容，停止读取
-            if next_msg_snippet and next_msg_snippet in text:
-                logger.info(f"🛑 遇到下一条消息内容 '{text}'，停止 OCR 录入。")
-                break
-                
-            # 4. 如果下一条是语音，尝试通过时长文本判断停止
-            if next_msg and next_msg.get("type") == "voice":
-                v_dur = next_msg.get("content", "").strip()
-                # 语音时长通常比较短，且包含 " 符号
-                if v_dur and v_dur in text and len(text) < 10:
-                    logger.info(f"🛑 遇到下一条语音时长 '{text}'，停止 OCR 录入。")
-                    break
-            
-            # 5. 安全兜底：如果距离当前语音气泡太远（超过600像素），停止
-            # 这可以防止读取到屏幕底部无关的内容
-            if c_y > vy + 600:
-                break
-                
-            extracted_text.append(text)
-            
-        full_text = " ".join(extracted_text)
-        logger.info(f"📝 OCR 识别结果: {full_text}")
-        
-        # 5. 再次长按语音消息 (为了取消转换)
-        # 注意：转换出文字后，界面可能会发生位移。
-        # 但通常语音气泡的相对位置（如果是最后一条）可能变化不大，或者我们假设用户不滑动
-        # 更稳妥的是：重新识别一次语音气泡位置？
-        # 用户说："这样原来什么样，识别完就是什么样"，意味着我们要恢复原状。
-        # 我们假设点击原来的位置还能点到语音气泡（如果它没被顶上去太多）
-        # 或者，我们可以点击转换出来的文字区域？
-        # 让我们尝试点击原来的坐标。
-        
-        self.d.long_click(vx, vy, 0.6)
-        time.sleep(0.3)
-        
-        # 6. CV 模板匹配寻找 "取消转文字" 按钮
-        menu_shot_path_cancel = os.path.join(self.screenshot_dir, "t6_menu_shot_cancel.jpg")
-        self.d.screenshot(menu_shot_path_cancel)
-        
-        cancel_template = os.path.join(TEMPLATE_DIR, "cancel_zhuan_wen_zi.jpg")
-        cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.6)
-        
-        if cancel_btn:
-            logger.info(f"✅ CV 找到 '取消转文字' 按钮: {cancel_btn}")
-            self.d.click(cancel_btn[0], cancel_btn[1])
-        else:
-            logger.warning("❌ CV 未找到 '取消转文字' 按钮，点击中心区域关闭菜单。")
-            self.d.click(500, 500)
-            
-        return full_text
-
    async def run(self):
-        logger.info("🚀 大张老师自动巡课系统启动...")
-        
-        last_screen_md5 = None
+        logger.info("🚀 大张老师自动巡课系统启动 (CV版)...")
        
        while True:
            try:
-                logger.info("🔍 正在扫描当前界面内容...")
+                # 1. 截图并分析
+                image_path = os.path.join(self.screenshot_dir, "current_screen.jpg")
+                self.d.screenshot(image_path)
                
-                # 1. 截图
-                tmp_shot = os.path.join(self.screenshot_dir, "t6_monitor_temp.jpg")
-                logger.info(f"📸 正在截取屏幕... ({datetime.now().strftime('%H:%M:%S')})")
-                self.d.screenshot(tmp_shot)
+                # 使用 WxUtil 的集中式分析逻辑
+                # 它会自动处理语音转文字，并返回对话列表和输入框坐标
+                dialogue_log, input_pos = await WxUtil.analyze_chat_image(image_path, self.screenshot_dir, device=self.d)
                
-                # 计算 MD5 并去重
-                import hashlib
-                with open(tmp_shot, 'rb') as f:
-                    current_md5 = hashlib.md5(f.read()).hexdigest()
-                    
-                if last_screen_md5 and current_md5 == last_screen_md5:
-                    logger.info("😴 屏幕内容未变，跳过本次循环。")
+                if not dialogue_log:
+                    logger.info("😴 未发现有效消息，等待下一次轮询。")
                    await asyncio.sleep(CHECK_INTERVAL)
                    continue
                
-                last_screen_md5 = current_md5
+                # 2. 只关注最后一条消息
+                last_msg = dialogue_log[-1]
+                logger.info(f"最后一条消息: {last_msg}")
                
-                # 2. 本地视觉分析 (替代 VLM)
-                logger.info("<EFBFBD>️ 正在进行本地视觉扫描...")
+                # 计算最后一条消息的哈希值，用于去重
+                current_msg_hash = hashlib.md5(last_msg.encode('utf-8')).hexdigest()
                
-                # A. 寻找语音图标 (audio.jpg) 和 红点 (red_point.jpg)
-                audio_template = os.path.join(TEMPLATE_DIR, "audio.jpg")
-                red_point_template = os.path.join(TEMPLATE_DIR, "red_point.jpg")
+                # 3. 判断是否需要回复
+                # 规则：最后一条消息由“对方”发送，且不是上一次处理过的消息
+                if "对方:" in last_msg:
+                    if current_msg_hash != self.last_processed_msg_hash:
+                        logger.info(f"💡 发现新消息，准备生成回复: {last_msg}")
                        
-                audio_matches = find_all_template_matches(tmp_shot, audio_template, threshold=0.8)
-                red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8)
+                        # 生成回复
+                        reply = await self.get_reply(last_msg)
                        
-                # B. 本地 OCR 识别全文以构建上下文
-                ocr_results = self.ocr_kit.read_text(tmp_shot)
-                # 按 Y 坐标排序
-                ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
-                
-                dialogue_log = []
-                voice_messages = []
-                
-                # 准备可视化调试图
-                import cv2
-                import numpy as np
-                debug_img = cv2.imread(tmp_shot)
-
-                # 记录已匹配到语音图标的 OCR 块索引
-                matched_ocr_indices = set()
-
-                # 先处理语音图标匹配
-                for ax, ay in audio_matches:
-                    # 排除顶部标题栏(0-300)和底部输入区(1800+)
-                    if ay < 300 or ay > 1800: 
-                        logger.info(f"⏭️ 忽略区域外语音图标: ({ax}, {ay})")
-                        continue
-                    
-                    sender = "对方" if ax < 500 else "我"
-                    logger.info(f"🎙️ 发现语音图标: x={ax}, y={ay}, 发送者={sender}")
-                    is_unread = False
-                    if red_points:
-                        for rx, ry in red_points:
-                            # 红点通常在语音图标右侧，且 Y 轴相近
-                            if abs(ry - ay) < 50 and rx > ax:
-                                is_unread = True
-                                # 绘制红点
-                                cv2.circle(debug_img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
-                                break
-                    
-                    # 寻找附近的时长文字 (OCR)
-                    duration_text = "语音"
-                    for idx, (bbox, text, conf) in enumerate(ocr_results):
-                        c_x = (bbox[0][0] + bbox[2][0]) / 2
-                        c_y = (bbox[0][1] + bbox[2][1]) / 2
-                        if abs(c_y - ay) < 40 and abs(c_x - ax) < 300:
-                            if '"' in text or text.isdigit():
-                                duration_text = text
-                                matched_ocr_indices.add(idx)
-                                break
-                    
-                    # 计算点击坐标：直接点击语音图标中心
-                    click_x, click_y = ax, ay
-                    
-                    # 绘制视觉反馈
-                    # 1. 语音图标用绿框
-                    cv2.rectangle(debug_img, (int(ax-30), int(ay-30)), (int(ax+30), int(ay+30)), (0, 255, 0), 3)
-                    # 2. 点击位置用红点 (用户偏好)
-                    cv2.circle(debug_img, (int(click_x), int(click_y)), 15, (0, 0, 255), -1)
-
-                    v_msg = {
-                        "type": "voice",
-                        "content": duration_text,
-                        "coordinates": [click_x, click_y],
-                        "sender": sender,
-                        "is_unread": is_unread
-                    }
-                    if sender == "对方":
-                        voice_messages.append(v_msg)
-                    dialogue_log.append({
-                        "y": ay,
-                        "text": f"{sender}: [语音] {duration_text}",
-                        "is_voice": True,
-                        "id": f"voice_{ax}_{ay}",
-                        "v_msg": v_msg
-                    })
-
-                # 处理剩余的 OCR 文字块 (普通文本)
-                for idx, (bbox, text, conf) in enumerate(ocr_results):
-                    if idx in matched_ocr_indices: continue
-                    
-                    x_min, x_max = bbox[0][0], bbox[2][0]
-                    y_min, y_max = bbox[0][1], bbox[2][1]
-                    c_x, c_y = (x_min + x_max) / 2, (y_min + y_max) / 2
-                    
-                    if c_y < 300 or c_y > 1800: continue
-                    
-                    if x_min < 250 and x_max < 700:
-                        sender, color = "对方", (0, 255, 0)
-                    elif x_max > 800 and x_min > 300:
-                        sender, color = "我", (255, 0, 0)
-                    else:
-                        sender, color = "系统", (128, 128, 128)
-                    
-                    if sender != "系统":
-                        logger.info(f"💬 发现文本消息: x={c_x}, y={c_y}, 发送者={sender}, 内容={text}")
-                        cv2.rectangle(debug_img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, 1)
-                        dialogue_log.append({
-                            "y": c_y,
-                            "text": f"{sender}: {text}",
-                            "is_voice": False
-                        })
-
-                # 按 Y 轴重新排序整个对话日志
-                dialogue_log.sort(key=lambda x: x['y'])
-
-                # 保存调试图
-                debug_shot_path = os.path.join(self.screenshot_dir, "t6_debug_view.jpg")
-                cv2.imwrite(debug_shot_path, debug_img)
-                logger.info(f"🎨 已保存视觉调试图: {debug_shot_path}")
-
-                # C. 寻找输入框 (CV 模板匹配)
-                input_template = os.path.join(TEMPLATE_DIR, "input_box.jpg") # 假设有这个模板
-                input_center = find_template_match(tmp_shot, input_template, threshold=0.6)
-                if not input_center:
-                    # 几何兜底：屏幕底部 88% 处
-                    from PIL import Image
-                    with Image.open(tmp_shot) as img:
-                        w, h = img.size
-                        input_center = [w // 2, int(h * 0.88)]
-                        logger.info(f"<EFBFBD> 使用几何兜底输入框坐标: {input_center}")
-
-                # 4. & 5. 统一处理最后一条消息逻辑 (只看最后一条)
-                should_reply = False
-                input_y = input_center[1] if input_center else None
-                
-                if dialogue_log:
-                    last_item = dialogue_log[-1]
-                    last_text = last_item["text"]
-                    # 构造唯一标识符：文本内容 + 坐标 (Y坐标取整到10像素以容纳轻微位移)
-                    current_msg_id = f"{last_text}_{int(last_item['y']/10)*10}"
-                    
-                    # 核心规则：只有当最后一条消息是“对方”说的，且内容未处理过，才回复。
-                    if last_text.startswith("对方"):
-                        if current_msg_id != self.last_processed_msg_id:
-                            logger.info(f"💡 发现新消息: {last_text}")
-                            
-                            # 如果是语音，且需要回复，则先转换
-                            if last_item.get("is_voice"):
-                                v_msg = last_item.get("v_msg")
-                                if v_msg:
-                                    logger.info(f"🎤 最后一条是语音，开始转换: {v_msg['content']}")
-                                    converted_text = await self.process_single_voice(v_msg, None, input_y)
-                                    if converted_text:
-                                        # 更新文本内容以便 LLM 理解
-                                        last_item["text"] = f"对方: [语音转文字: {converted_text}]"
-                                        logger.info(f"✅ 语音转换成功: {converted_text}")
-                                    else:
-                                        logger.warning("⚠️ 语音转换未提取到文字，将尝试直接回复或跳过。")
-                            
-                            should_reply = True
-                        else:
-                            # 消息已处理过
-                            should_reply = False
-                    else:
-                        # 最后一条是我发送的，或者是系统消息
-                        should_reply = False
-                        # 记录一下，避免在没有新消息时重复进入逻辑
-                        if current_msg_id != self.last_processed_msg_id:
-                            self.last_processed_msg_id = current_msg_id
-                            logger.info(f"⚪ 最后一条消息非对方发送，无需回复: {last_text}")
-
-                if should_reply:
-                    logger.info("🤖 准备调用 LLM 生成回复...")
-                    # 立即更新状态，防止在回复生成期间重复触发
-                    self.last_processed_msg_id = current_msg_id
-                    
-                    # 构建完整历史用于上下文
-                    final_dialogue_texts = [item['text'] for item in dialogue_log]
-                    history_text = "\n".join(final_dialogue_texts)
-                    
-                    reply = await self.get_reply(history_text)
                        if reply:
-                        logger.info(f"💡 LLM 回复: {reply}")
-                        if input_center:
-                             perform_input_action(self.d, input_center, reply)
-                             time.sleep(1) # 等待发送完成
+                            logger.info(f"🤖 LLM 回复: {reply}")
+                            # 执行输入和发送
+                            if input_pos:
+                                perform_input_action(self.d, input_pos, reply)
+                                logger.info("✅ 回复已发送")
+                                # 成功发送后更新最后处理的消息哈希
+                                self.last_processed_msg_hash = current_msg_hash
                            else:
-                        logger.warning("⚠️ LLM 未生成有效回复。")
+                                logger.warning("❌ 未找到输入框位置，无法发送回复")
+                        else:
+                            logger.warning("⚠️ LLM 未生成有效回复")
+                    else:
+                        # 消息已处理过，不重复回复
+                        pass
+                else:
+                    # 最后一条是我发送的或者是系统消息，更新哈希以防之后重复处理（如果之后又变成对方发）
+                    # 或者简单地跳过
+                    if current_msg_hash != self.last_processed_msg_hash:
+                        logger.info(f"⚪ 最后一条消息非对方发送，无需回复: {last_msg}")
+                        self.last_processed_msg_hash = current_msg_hash

-                self.is_first_run = False
-                
-                # 休眠
+                # 4. 休眠
                await asyncio.sleep(CHECK_INTERVAL)

            except Exception as e:
--- a/WeiXin/WxUtil.py
+++ b/WeiXin/WxUtil.py
@@ -14,11 +14,9 @@ if project_root not in sys.path:
    sys.path.append(project_root)

 import json
-from Util.VLMKit import VLMKit
 from Util.EasyOcrKit import EasyOcrKit

-# 初始化 VLMKit 和 EasyOcrKit
-vlm_kit = VLMKit()
+# 初始化 EasyOcrKit
 ocr_kit = EasyOcrKit()

 # 配置日志
@@ -72,302 +70,216 @@ def connect_device():
        logger.error(f"设备连接失败: {e}")
        return None

-async def get_vlm_json(image_path, prompt):
-    """
-    通用 VLM 分析函数，返回 JSON 数据 (自动处理归一化坐标的反归一化)
-    """
-    try:
-        # 调用 VLM
-        response = await vlm_kit.analyze_image(image_path, prompt)
-        json_str = vlm_kit.extract_json(response)
-        result_data = json.loads(json_str)
-        
-        # 获取图片尺寸进行坐标反归一化
-        try:
-            from PIL import Image
-            with Image.open(image_path) as img:
-                width, height = img.size
-                
-            # 定义反归一化函数
-            def denormalize(point):
-                if not point or len(point) != 2:
-                    return point
-                return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)]
-                
-            # 递归遍历字典进行反归一化 (仅针对常见坐标字段 center, input_box)
-            def recursive_denormalize(data):
-                if isinstance(data, dict):
-                    for key, value in data.items():
-                        if key in ["center", "input_box", "coordinates"] and isinstance(value, list) and len(value) == 2:
-                            data[key] = denormalize(value)
-                        elif isinstance(value, (dict, list)):
-                            recursive_denormalize(value)
-                elif isinstance(data, list):
-                    for item in data:
-                        recursive_denormalize(item)
-                        
-            recursive_denormalize(result_data)
-                        
-        except Exception as e:
-            logger.warning(f"坐标反归一化失败: {e}，将使用原始坐标")
-            
-        return result_data
-    except Exception as e:
-        logger.error(f"VLM Analysis Failed: {e}", exc_info=True)
-        return None
-
-async def get_vlm_analysis(image_path):
-    """
-    仅调用 VLM 分析图片，返回原始 JSON 数据 (dict)
-    """
-    logger.info(f"正在使用 VLM 分析图片: {image_path}")
-    
-    # 构造 Prompt
-    prompt = """
-    请分析这张微信聊天截图，提取所有对话消息。
-
-    【核心规则 - 优先级最高】
-    1. 🚀 **从下往上扫描**：必须确保屏幕最底部的消息被识别。很多时候最底部的消息是最重要的。
-    2. 🔴 **未读红点 (Unread)**：极度关注语音气泡右上角的红点。如果有红点，`is_unread` 必须为 true。
-    3. 📦 **完整性**：识别图中【所有】可见的消息气泡，包括文本消息、语音消息、系统提示（如“昨天 10:36”、“你撤回了一条消息”）。
-
-    【消息类型判别】
-    - **发送者 (Sender)**：左侧头像为“对方”(Other)，右侧头像为“我”(Me)。
-    - **语音 (Voice)**：
-        - 气泡内只有时长（如 5"）和声波图标。
-        - **重点**：如果语音气泡右侧有灰色的“转文字”字样或红点，且下方没有对应的文本翻译气泡，说明它【尚未转换】。
-        - `status` 判断：只有当语音气泡【正下方】紧跟着一个相同发送者的文本气泡（内容是翻译结果），`status` 才为 "converted"。否则为 "unconverted"。
-    - **文本 (Text)**：气泡内包含具体的文字内容。
-
-    【坐标系统】
-    - 使用 [0-1000] 归一化坐标。返回气泡的几何中心点 `center`。
-    - 识别底部输入框的位置 `input_box`。
-
-    【输出格式】
-    请返回纯 JSON 格式：
-    {
-        "is_chat_interface": true,
-        "input_box": [x, y],
-        "messages": [
-            {
-                "type": "voice" | "text" | "system",
-                "sender": "对方" | "我" | "系统",
-                "status": "converted" | "unconverted", 
-                "is_unread": true | false,
-                "center": [x, y],
-                "content": "消息内容或时长"
-            },
-            ...
-        ]
-    }
-    """
-    2. <EFBFBD> **红点 (Unread)**：极度关注语音气泡右上角的红点。如果有红点，`is_unread` 必须为 true。
-    3. 📦 **完整性**：识别图中【所有】可见的消息气泡。不要遗漏任何一个，特别是连续的语音消息。
-
-    【消息类型判别】
-    - **发送者 (Sender)**：左侧头像为“对方”(Other)，右侧头像为“我”(Me)。
-    - **语音 (Voice)**：气泡内只有时长（如 5"）和声波图标。
-        - 语音气泡右侧可能有“转文字”或“取消”等灰色小字，请忽略这些文字，气泡依然是 Voice。
-        - `status` 判断：如果语音气泡下方【紧接着】有一个属于同一人的文本气泡，且内容看起来像翻译结果，则 `status` 为 "converted"，否则为 "unconverted"。
-    - **文本 (Text)**：气泡内包含具体的文字内容。
-
-    【坐标系统】
-    - 使用 [0-1000] 归一化坐标。返回气泡的几何中心点 `center`。
-    - 识别底部输入框的位置 `input_box`。
-
-    【输出格式】
-    请返回纯 JSON 格式：
-    {
-        "is_chat_interface": true,
-        "input_box": [x, y],
-        "messages": [
-            {
-                "type": "voice",
-                "sender": "对方" | "我",
-                "status": "converted" | "unconverted", 
-                "is_unread": true | false,
-                "center": [x, y],
-                "content": "8\""
-            },
-            ...
-        ]
-    }
-    """
-    
-    try:
-        # 调用 VLM
-        response = await vlm_kit.analyze_image(image_path, prompt)
-        logger.info(f"VLM Raw Response: {response}") # 打印原始响应以便调试
-        json_str = vlm_kit.extract_json(response)
-        result_data = json.loads(json_str)
-        
-        # 获取图片尺寸进行坐标反归一化
-        try:
-            from PIL import Image
-            with Image.open(image_path) as img:
-                width, height = img.size
-                
-            # 定义反归一化函数
-            def denormalize(point):
-                if not point or len(point) != 2:
-                    return point
-                return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)]
-                
-            # 反归一化 input_box
-            if result_data.get("input_box"):
-                result_data["input_box"] = denormalize(result_data["input_box"])
-                
-            # 反归一化 messages
-            if result_data.get("messages"):
-                for msg in result_data["messages"]:
-                    if msg.get("center"):
-                        msg["center"] = denormalize(msg["center"])
-                    if msg.get("coordinates"): # 兼容旧字段
-                        msg["coordinates"] = denormalize(msg["coordinates"])
-                        
-        except Exception as e:
-            logger.warning(f"坐标反归一化失败: {e}，将使用原始坐标")
-            
-        return result_data
-    except Exception as e:
-        logger.error(f"VLM Analysis Failed: {e}", exc_info=True)
-        return None
-
 async def analyze_chat_image(image_path, output_path, device=None, target_name="对方"):
    """
-    使用 VLM 识别微信聊天截图中的对话内容、语音消息状态以及输入框位置
-    替代原本的 CV/OCR 方案
+    全面采用 CV + OCR 识别微信聊天截图中的最后一条消息
+    不再使用 VLM
    """
-    
-    # 语音识别标志
-    should_trigger_convert = False
-    
    try:
-        result_data = await get_vlm_analysis(image_path)
-        
-        if not result_data:
-            return [], None
-
-        try:
-            # 检查是否为聊天界面
-            is_chat = result_data.get("is_chat_interface", False)
-            if not is_chat:
-                logger.warning("VLM 判断当前不是微信聊天界面")
-                return None, None
-                
-            if isinstance(result_data, list):
-                # 兼容旧格式
-                messages = result_data
-                input_field_coordinates = None
-            else:
-                messages = result_data.get("messages", [])
-                input_field_coordinates = result_data.get("input_box") # input_box
-
-        except Exception as e:
-            logger.error(f"解析 VLM 结果失败: {e}")
-            return [], None
-
-        dialogue_log = []
-        unconverted_voices = []
-
-        # 处理识别结果
-        for msg in messages:
-            sender = msg.get('sender', '未知')
-            msg_type = msg.get('type', 'other')
-            content = msg.get('content', '')
-            coords = msg.get('center', [0, 0]) # center
-            status = msg.get('status', 'unconverted')
-            is_unread = msg.get('is_unread', False)
-            is_converted = (status == "converted")
-            
-            unread_mark = "[未读]" if is_unread else ""
-            
-            # 记录对话日志
-            if msg_type == 'voice':
-                if is_converted:
-                     dialogue_log.append(f"{sender}: {unread_mark}[语音] {content} (已转换)")
-                else:
-                     dialogue_log.append(f"{sender}: {unread_mark}[语音] (待转换)")
-                     # 将 center 转换为 coordinates 供后续使用
-                     msg['coordinates'] = coords 
-                     unconverted_voices.append(msg)
-            elif msg_type == 'text':
-                dialogue_log.append(f"{sender}: {content}")
-            
-            logger.info(f"VLM 识别: {sender} [{msg_type}] {content} (Converted: {is_converted}, Unread: {is_unread})")
-
-        # 处理未转换的语音消息
-        if unconverted_voices:
-            # 优先级：1. 有红点的最后一条 2. 没红点的最后一条
-            unread_voices = [v for v in unconverted_voices if v.get('is_unread')]
-            if unread_voices:
-                logger.info(f"发现 {len(unread_voices)} 条未读语音消息，优先处理最后一条...")
-                voice_to_process = unread_voices[-1]
-            else:
-                logger.info(f"发现 {len(unconverted_voices)} 条未转换语音消息，处理最后一条...")
-                voice_to_process = unconverted_voices[-1]
-            
-            # 仅保留选中的一条进行处理
-            unconverted_voices = [voice_to_process]
-            
-            # 使用传入的 device 或创建新连接
+        # 1. 初始化
        d = device if device else connect_device()
+        if not d:
+            return [], None
            
-            for voice in unconverted_voices:
-                vx, vy = voice['coordinates']
-                logger.info(f"长按语音消息: ({vx}, {vy})")
-                d.long_click(vx, vy, 1.5)
-                time.sleep(1.0)
+        # 2. 读取图片
+        img = cv2.imread(image_path)
+        if img is None:
+            logger.error(f"无法读取图片: {image_path}")
+            return [], None
+        h, w = img.shape[:2]
        
-                # 查找“转文字” (使用 OCR)
-                menu_shot_path = os.path.join(os.path.dirname(image_path), "temp_menu_shot.jpg")
-                d.screenshot(menu_shot_path)
+        # 3. 模板匹配寻找语音图标和红点
+        audio_template = os.path.join(TEMPLATE_DIR, "audio.jpg")
+        red_point_template = os.path.join(TEMPLATE_DIR, "red_point.jpg")
        
-                # OCR 识别
-                ocr_results = ocr_kit.read_text(menu_shot_path)
-                convert_btn_center = None
+        audio_matches = find_all_template_matches(image_path, audio_template, threshold=0.8)
+        red_points = find_all_template_matches(image_path, red_point_template, threshold=0.8)
        
-                for bbox, text, conf in ocr_results:
-                    if "转文字" in text or "转换为文字" in text:
-                        # bbox is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
-                        # Calculate center
-                        c_x = int((bbox[0][0] + bbox[2][0]) / 2)
-                        c_y = int((bbox[0][1] + bbox[2][1]) / 2)
-                        convert_btn_center = (c_x, c_y)
+        # 4. OCR 识别所有文本
+        logger.info("正在执行 OCR 识别...")
+        ocr_results = ocr_kit.read_text(image_path)
+        
+        # 5. 整合所有消息
+        messages = []
+        debug_img = img.copy() # 初始化调试图
+
+        # A. 添加语音消息
+        for ax, ay in audio_matches:
+            # 过滤掉顶部和底部的非聊天区域 (经验值: 顶部150, 底部250)
+            if ay < 150 or ay > h - 250:
+                continue
+                
+            sender = "对方" if ax < w / 2 else "我"
+            is_unread = False
+            for rx, ry in red_points:
+                # 红点通常在语音图标右侧且 Y 轴相近
+                if abs(ry - ay) < 50 and rx > ax:
+                    is_unread = True
                    break
            
-                if convert_btn_center:
-                    logger.info(f"OCR 找到 '转文字' 按钮: {convert_btn_center}")
-                    d.click(convert_btn_center[0], convert_btn_center[1])
-                    should_trigger_convert = True
+            # 根据已读/未读画框：未读红框，已读绿框
+            color = (0, 0, 255) if is_unread else (0, 255, 0)
+            cv2.rectangle(debug_img, (ax-30, ay-30), (ax+30, ay+30), color, 2)

-                    # 动态等待: 60s语音约需10s转换，比例约 1/6
-                    duration_str = voice.get('content', '0').replace('"', '').strip()
-                    try:
-                        duration = int(duration_str)
-                    except:
-                        duration = 10 # 默认值
+            # --- 新增：判断是否已转文字 ---
+            is_converted = False
+            for bbox, text, conf in ocr_results:
+                c_x = int((bbox[0][0] + bbox[2][0]) / 2)
+                c_y = int((bbox[0][1] + bbox[2][1]) / 2)
+                # 转换后的文字通常在语音图标下方 30-300 像素内，且水平位置相近
+                if 30 < c_y - ay < 300 and abs(c_x - ax) < 200:
+                    is_converted = True
+                    break
            
-                    wait_seconds = max(2, duration / 5.0) # 稍微多等一点，用 /5.0
-                    logger.info(f"语音时长 {duration}s，预计等待转换 {wait_seconds:.1f}s...")
-                    time.sleep(wait_seconds)
+            label = "YES" if is_converted else "NO"
+            # 在框的右侧标注 YES 或 NO
+            cv2.putText(debug_img, label, (ax + 40, ay + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)
+            # --- 结束 ---

+            messages.append({
+                "type": "voice",
+                "sender": sender,
+                "center": (ax, ay),
+                "y": ay,
+                "is_unread": is_unread,
+                "is_converted": is_converted
+            })
+            
+        # B. 添加文本消息
+        # 简单策略：排除掉明显是系统时间、输入框或顶部标题的文字
+        for bbox, text, conf in ocr_results:
+            c_x = int((bbox[0][0] + bbox[2][0]) / 2)
+            c_y = int((bbox[0][1] + bbox[2][1]) / 2)
+            
+            # 过滤区域
+            if c_y < 150 or c_y > h - 250:
+                continue
+            
+            # 过滤掉单字（可能是头像旁边的文字或杂质）和某些系统词
+            if len(text) < 1 and "昨天" not in text and "今天" not in text:
+                continue
+                
+            sender = "对方" if c_x < w / 2 else "我"
+            messages.append({
+                "type": "text",
+                "sender": sender,
+                "content": text,
+                "center": (c_x, c_y),
+                "y": c_y
+            })
+            
+        # 6. 排序并找出最后一条消息
+        if not messages:
+            logger.warning("未发现任何消息")
+            if output_path:
+                cv2.imwrite(output_path, debug_img)
+            return [], None
+            
+        # 按 Y 坐标从上到下排序
+        messages.sort(key=lambda x: x['y'])
+        last_msg = messages[-1]
+        
+        if output_path:
+            cv2.imwrite(output_path, debug_img)
+            logger.info(f"调试图已保存: {output_path}")
+
+        dialogue_log = []
+        input_field_coordinates = (w // 2, int(h * 0.9)) # 默认输入框位置
+        
+        # 7. 自动处理所有“红框 + NO”的语音消息
+        unconverted_voices = [m for m in messages if m['type'] == 'voice' and m.get('is_unread') and not m.get('is_converted')]
+        
+        if unconverted_voices:
+            logger.info(f"发现 {len(unconverted_voices)} 条未转换的未读语音，开始处理...")
+        
+        for v_msg in unconverted_voices:
+            vx, vy = int(v_msg['center'][0]), int(v_msg['center'][1])
+            logger.info(f"--- 正在处理语音消息 ({vx}, {vy}) ---")
+            
+            # A. 长按语音
+            logger.info(f"正在长按语音消息 ({vx}, {vy})...")
+            d.long_click(vx, vy, 1.5)
+            time.sleep(1.5)
+            
+            # B. 截图寻找“转文字”按钮
+            menu_shot = os.path.join(OUTPUT_DIR, f"voice_menu_{vy}.jpg")
+            d.screenshot(menu_shot)
+            zhuan_template = os.path.join(TEMPLATE_DIR, "zhun_wen_zi.jpg")
+            
+            # 降低阈值到 0.7 以增加匹配成功率
+            btn_pos = find_template_match(menu_shot, zhuan_template, threshold=0.7)
+            
+            if btn_pos:
+                btn_x, btn_y = int(btn_pos[0]), int(btn_pos[1])
+                logger.info(f"✅ 找到'转文字'按钮: ({btn_x}, {btn_y})，点击中...")
+                d.click(btn_x, btn_y)
+                
+                # 等待转换完成 (根据语音长度，通常 3-5 秒足够)
+                logger.info("等待语音转文字完成...")
+                time.sleep(5.0)
+                
+                # C. 再次截图 OCR 获取转换后的文字
+                after_convert_shot = os.path.join(OUTPUT_DIR, f"after_auto_{vy}.jpg")
+                d.screenshot(after_convert_shot)
+                convert_ocr = ocr_kit.read_text(after_convert_shot)
+                
+                # 提取转换文字：寻找在语音图标下方的文字块
+                converted_text = ""
+                for c_bbox, c_text, c_conf in convert_ocr:
+                    cc_x = (c_bbox[0][0] + c_bbox[2][0]) / 2
+                    cc_y = (c_bbox[0][1] + c_bbox[2][1]) / 2
+                    # 转换后的文字通常在语音图标下方 30-300 像素内，且水平位置相近
+                    if 30 < cc_y - vy < 300 and abs(cc_x - vx) < 250:
+                        converted_text = c_text
+                        break
+                
+                if converted_text:
+                    logger.info(f"✨ OCR 识别成功!")
+                    print(f"\n[语音转文字结果]: {converted_text}\n")
+                    # 同步到消息对象
+                    v_msg['content'] = converted_text
+                    v_msg['is_converted'] = True
+                    # 如果这条消息也是最后一条消息，更新 dialogue_log 需要的内容
+                    if v_msg == last_msg:
+                        last_msg['content'] = converted_text
                else:
-                    logger.warning("OCR 未找到 '转文字' 菜单项")
-                    # 点击空白处关闭菜单，避免遮挡
-                    d.click(vx + 200, vy) 
+                    logger.warning("❌ OCR 未能提取到转换后的文字内容")
                
-            if should_trigger_convert:
-                # 转换完成后稍微多等一下，确保 UI 刷新
+                # D. 长按并点击“取消转文字”恢复界面
+                logger.info("正在恢复界面状态 (点击'取消转文字')...")
+                d.long_click(vx, vy, 1.5)
                time.sleep(1.0)
-                # 即使触发了转换，我们也返回当前的对话日志，但在日志末尾注明正在转换
-                dialogue_log.append("系统: [正在转换语音...]")
-                return dialogue_log, input_field_coordinates
+                cancel_shot = os.path.join(OUTPUT_DIR, f"cancel_menu_{vy}.jpg")
+                d.screenshot(cancel_shot)
+                cancel_template = os.path.join(TEMPLATE_DIR, "cancel_zhuan_wen_zi.jpg")
+                cancel_btn = find_template_match(cancel_shot, cancel_template, threshold=0.7)
+                
+                if cancel_btn:
+                    c_btn_x, c_btn_y = int(cancel_btn[0]), int(cancel_btn[1])
+                    d.click(c_btn_x, c_btn_y)
+                    logger.info(f"✅ 已点击'取消转文字' ({c_btn_x}, {c_btn_y})，界面已恢复")
+                else:
+                    # 兜底：点击语音图标右侧空白处尝试关闭菜单
+                    logger.warning("⚠️ 未找到'取消转文字'按钮，尝试点击空白处关闭菜单")
+                    d.click(vx + 300, vy)
+            else:
+                logger.warning("❌ 未能找到'转文字'按钮，可能长按失败或模板不匹配")
+                # 尝试点击空白处退出菜单
+                d.click(vx + 300, vy)
+
+        # 8. 整合对话日志 (仅针对最后一条消息进行反馈)
+        dialogue_log = []
+        if last_msg['type'] == 'voice':
+            # 优先使用刚才转文字得到的内容
+            content = last_msg.get('content') or "[语音]"
+            dialogue_log.append(f"{last_msg['sender']}: {content}")
+        else:
+            dialogue_log.append(f"{last_msg['sender']}: {last_msg['content']}")
            
        return dialogue_log, input_field_coordinates
        
    except Exception as e:
-        logger.error(f"VLM 分析失败: {e}", exc_info=True)
+        logger.error(f"analyze_chat_image 失败: {e}", exc_info=True)
        return [], None


--- a/WeiXin/pycache/WxUtil.cpython-310.pyc
+++ b/WeiXin/pycache/WxUtil.cpython-310.pyc