'commit'
This commit is contained in:
@@ -1,75 +0,0 @@
|
||||
# coding=utf-8
|
||||
import time
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
import asyncio
|
||||
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
from WeiXin import WxUtil
|
||||
from WeiXin.WxUtil import analyze_chat_image
|
||||
|
||||
# Configure logging: every record goes to a per-script log file and to the console.
log_dir = WxUtil.LOG_DIR
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the same directory.
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(log_dir, "T2_GetHistory.log"), encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("T2_GetHistory")
|
||||
|
||||
async def get_history(target_name="对方"):
    """Capture the current screen and print the chat history recognised on it.

    Takes a device screenshot, hands it to ``analyze_chat_image`` and prints
    every recognised dialogue line to the console.

    Args:
        target_name: Forwarded unchanged to ``analyze_chat_image`` as
            ``target_name``.

    Returns:
        None. Results are printed/logged; failures are logged, not raised.
    """
    # Per the original comment, this cleans the Logs and Output directories
    # before the run.
    WxUtil.setup_script_environment()

    logger.info("开始执行 T2: 获取当前屏幕对话历史...")

    d = WxUtil.connect_device()
    if not d:
        # connect_device() returns a falsy value on failure; nothing to do.
        return

    # Screenshot destination; exist_ok avoids a check-then-create race.
    screenshot_dir = WxUtil.OUTPUT_DIR
    os.makedirs(screenshot_dir, exist_ok=True)

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    save_path = os.path.join(screenshot_dir, f"T2_history_{timestamp}.jpg")
    analyzed_path = os.path.join(screenshot_dir, f"T2_history_{timestamp}_analyzed.jpg")

    try:
        d.screenshot(save_path)
        logger.info(f"截图已保存: {save_path}")

        # Delegate recognition to WxUtil; the second return value (input box
        # position) is not needed by this script.
        dialogue_log, _ = await analyze_chat_image(
            save_path, analyzed_path, device=d, target_name=target_name
        )

        logger.info("✅ T2 识别结果:")
        if dialogue_log:
            for entry in dialogue_log:
                print(entry)  # echoed to the console on purpose
        else:
            logger.info("未提取到对话内容或当前屏幕无对话气泡。")

        # A placeholder entry means a voice message is still being converted
        # to text, so the captured history is incomplete.
        if isinstance(dialogue_log, list) and any(
            "[正在转换语音...]" in str(msg) for msg in dialogue_log
        ):
            logger.info("检测到语音正在转文字,建议等待转换完成后重新运行 T2 以获取完整内容。")

    except Exception as e:
        # Top-level boundary of the script: log with the traceback so the
        # failure can be diagnosed from the log file.
        logger.error(f"❌ T2 执行失败: {e}", exc_info=True)
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(get_history())
|
||||
@@ -1,135 +0,0 @@
|
||||
# coding=utf-8
|
||||
import time
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
import asyncio
|
||||
|
||||
# 添加项目根目录到 sys.path 以便导入 Util
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
from WeiXin import WxUtil
|
||||
from WeiXin.WxUtil import find_input_box_center, perform_input_action, analyze_chat_image, clean_screenshots_dir
|
||||
from Util.LlmUtil import get_llm_response
|
||||
|
||||
# Configure logging: every record goes to a per-script log file and to the console.
log_dir = WxUtil.LOG_DIR
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the same directory.
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(log_dir, "T3_InputLlmText.log"), encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("T3_InputLlmText")
|
||||
|
||||
# Placeholder entry that marks a voice message whose speech-to-text
# conversion has not finished yet.
_CONVERTING_MARK = "[正在转换语音...]"


def _is_converting(dialogue_log):
    """Return True when *dialogue_log* is a list containing the converting placeholder."""
    return isinstance(dialogue_log, list) and any(
        _CONVERTING_MARK in str(msg) for msg in dialogue_log
    )


async def generate_and_input():
    """Read the chat on screen, ask the LLM for a reply, then type and send it.

    Steps: connect to the device, screenshot and analyse the chat (retrying
    once after 3 seconds if a voice message is still being converted), build
    the persona prompt, collect the LLM response, and pass it to
    ``perform_input_action`` with ``auto_send=True``.

    Returns:
        None. Progress and failures are logged; exceptions are caught and
        logged with a traceback rather than propagated.
    """
    # Per the original comment, this cleans the Logs and Output directories
    # before the run.
    WxUtil.setup_script_environment()

    logger.info("开始执行 T3: 生成 LLM 回复并输入...")

    try:
        # 1. Connect to the device.
        d = WxUtil.connect_device()
        if not d:
            return

        # 2. Screenshot and recognise the dialogue history.
        screenshot_dir = WxUtil.OUTPUT_DIR
        # Make sure the screenshot target exists (the T2 script does the same).
        os.makedirs(screenshot_dir, exist_ok=True)

        tmp_shot = os.path.join(screenshot_dir, "t4_temp_history_check.jpg")
        analyzed_shot = os.path.join(screenshot_dir, "t4_temp_history_analyzed.jpg")

        d.screenshot(tmp_shot)
        dialogue_log, _ = await analyze_chat_image(tmp_shot, analyzed_shot, device=d)

        if _is_converting(dialogue_log):
            logger.info("检测到语音正在转文字,等待 3 秒后重新截图分析...")
            await asyncio.sleep(3)
            d.screenshot(tmp_shot)
            dialogue_log, _ = await analyze_chat_image(tmp_shot, analyzed_shot, device=d)

            # Continue regardless of the second result; only warn.
            if _is_converting(dialogue_log):
                logger.warning("语音转换时间较长,将尝试根据当前已有内容生成回复。")

        # Build the history text. Entries are stringified because the log is
        # not guaranteed to contain only str items, and the converting
        # placeholder is dropped so it does not confuse the LLM.
        history_lines = []
        if isinstance(dialogue_log, list):
            history_lines = [
                str(msg) for msg in dialogue_log if _CONVERTING_MARK not in str(msg)
            ]
            history_text = "\n".join(history_lines)
        elif dialogue_log:
            history_text = str(dialogue_log)
        else:
            history_text = ""

        if not history_text.strip():
            # Also covers a log that held nothing but converting placeholders.
            logger.warning("未提取到对话历史")
            history_text = "(无对话历史)"
        elif history_lines:
            logger.info("提取到对话历史: ")
            for line in history_lines:
                logger.info(line)
        else:
            logger.info(f"提取到对话历史: {history_text}")

        # 3. Ask the LLM for a reply.
        persona = (
            "你是一名1999年毕业、拥有27年一线教学经验的小学高级教师,名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。"
            "你不仅是一位作文教学专家,批阅过超过2万篇作文,更是一位懂得孩子心理、能与家长共情的教育智者。"
            "你的回复风格应该是:温厚、亲切、睿智且极具亲和力,就像一位老邻居、老大哥在和家长坐在长椅上聊天。"
            "严禁使用任何列表格式(如 1. 2. 3. 或 A. B. C.),严禁使用‘首先、其次、最后’等刻板的逻辑词。"
            "要用连贯、优美、富有生活气息的文字进行叙述。每一句话都要带温度,要善于用‘咱们孩子’、‘作为老师我也理解’等词汇拉近距离。"
            "回复内容要有深度,不要只给结论,要讲透背后的教育逻辑。字数必须严格控制在 100-200 字之间。"
            "如果涉及到校区信息,必须且只能使用以下真实数据,绝对严禁使用任何占位符:\n"
            "- 单位:长春市少惠林作文素养培养中心\n"
            "- 地址:南环城路与临河街交汇,TOUCH12街3楼325号\n"
            "- 联系人:小张老师(电话:18686619970)\n"
            "- 课程:线上/线下作文课、读书营/阅读策略营(假期开设)\n"
            "- 上课:作文通常在周二/周四晚上,周六/周日全天"
        )
        prompt = (
            f"【教师人设】:{persona}\n\n"
            f"【近期聊天记录】:\n{history_text}\n\n"
            "【任务要求】:\n"
            "请作为大张老师,给家长写一段暖心且有深度的回复。要针对家长最后提到的问题或状态,先给予情感上的安抚和理解,"
            "然后再结合你27年的教学经验,给出具体的建议。建议要讲得细致、感人,展现出老教师的智慧沉淀。"
            "全文必须是一个或两个完整的自然段,绝对禁止分点列项!字数严格在 100-200 字之间。"
            "如果需要提供联系方式或地址,请自然地揉进段落中,严禁出现'XX'占位符。直接输出回复的正文内容,不要包含任何多余的修饰词或引号。"
        )
        logger.info("正在以'亲切而专业的特级教师'身份请求 LLM 生成深度回复...")

        # get_llm_response is consumed as an async iterator even with stream=False.
        chunks = [chunk async for chunk in get_llm_response(prompt, stream=False)]
        full_response = "".join(chunks)

        # Strip surrounding whitespace and wrapping quotes the model may add.
        llm_text = full_response.strip().strip('"').strip('“').strip('”')
        logger.info(f"LLM 生成的回复内容: {llm_text}")

        if not llm_text:
            logger.error("LLM 生成内容为空,停止执行。")
            return

        # 4. Locate the input box (the bounding rectangle is not used here).
        center_point, _ = find_input_box_center(tmp_shot)

        # 5. Type and send. Per the original comment, perform_input_action
        # falls back to native widget lookup when center_point is None.
        logger.info("正在准备输入回复内容...")
        success = perform_input_action(d, center_point, llm_text, auto_send=True)

        if success:
            logger.info("✅ T3 执行完成:文字已成功输入并点击发送。")
        else:
            logger.error("❌ T3 执行失败:输入动作未成功完成。")

    except Exception as e:
        # Top-level boundary of the script: log with traceback, do not re-raise.
        logger.error(f"❌ T3 执行出错: {e}", exc_info=True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(generate_and_input())
|
||||
@@ -30,7 +30,9 @@ logging.basicConfig(
|
||||
logger = logging.getLogger("T4_CV_Voice_Debug")
|
||||
|
||||
|
||||
def run_cv_debug():
|
||||
import asyncio
|
||||
|
||||
async def run_cv_debug():
|
||||
# 运行前清理 Logs 和 Output
|
||||
WxUtil.setup_script_environment()
|
||||
|
||||
@@ -42,7 +44,6 @@ def run_cv_debug():
|
||||
|
||||
try:
|
||||
screenshot_dir = WxUtil.OUTPUT_DIR
|
||||
|
||||
image_path = os.path.join(screenshot_dir, "t4_live_shot.jpg")
|
||||
output_path = os.path.join(screenshot_dir, "T4_debug_view.jpg")
|
||||
|
||||
@@ -52,67 +53,20 @@ def run_cv_debug():
|
||||
logger.error(f"❌ 拍照失败: {e}")
|
||||
return
|
||||
|
||||
logger.info(f"🔍 正在分析实时图片...")
|
||||
logger.info(f"🔍 正在调用 WxUtil.analyze_chat_image 分析最后一条消息...")
|
||||
|
||||
# 模板路径
|
||||
template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates")
|
||||
audio_template = os.path.join(template_dir, "audio.jpg")
|
||||
red_point_template = os.path.join(template_dir, "red_point.jpg")
|
||||
# 2. 调用新的分析逻辑
|
||||
dialogue_log, input_pos = await WxUtil.analyze_chat_image(image_path, output_path, device=d)
|
||||
|
||||
if not os.path.exists(audio_template) or not os.path.exists(red_point_template):
|
||||
logger.error("错误: 模板文件不存在")
|
||||
return
|
||||
if dialogue_log:
|
||||
logger.info("📢 识别到的最后一条消息:")
|
||||
for line in dialogue_log:
|
||||
logger.info(f" {line}")
|
||||
else:
|
||||
logger.warning("⚠️ 未识别到任何消息")
|
||||
|
||||
# 2. 识别逻辑
|
||||
audio_matches = find_all_template_matches(image_path, audio_template, threshold=0.8)
|
||||
red_points = find_all_template_matches(image_path, red_point_template, threshold=0.8)
|
||||
|
||||
logger.info(f"发现语音图标数量: {len(audio_matches)}")
|
||||
logger.info(f"发现红点数量: {len(red_points)}")
|
||||
|
||||
# 3. 读取图片并绘制
|
||||
img = cv2.imread(image_path)
|
||||
if img is None:
|
||||
logger.error("错误: 无法读取图片")
|
||||
return
|
||||
|
||||
for ax, ay in audio_matches:
|
||||
# 排除顶部标题栏和底部输入区 (假设 300-1800 为有效区)
|
||||
if ay < 300 or ay > 1800:
|
||||
continue
|
||||
|
||||
sender = "对方" if ax < 500 else "我"
|
||||
|
||||
# --- 1. 先判断是否未读 (寻找附近的红点) ---
|
||||
is_unread = False
|
||||
for rx, ry in red_points:
|
||||
if abs(ry - ay) < 50 and rx > ax:
|
||||
is_unread = True
|
||||
break
|
||||
|
||||
# --- 2. 根据状态选择颜色 ---
|
||||
# BGR 格式: 红色 (0, 0, 255), 绿色 (0, 255, 0)
|
||||
color = (0, 0, 255) if is_unread else (0, 255, 0)
|
||||
status_text = "未读" if is_unread else "已读"
|
||||
|
||||
# --- 3. 绘制标注 ---
|
||||
# 语音图标框 (加粗)
|
||||
cv2.rectangle(img, (int(ax-35), int(ay-35)), (int(ax+35), int(ay+35)), color, 3)
|
||||
|
||||
# 中心点击位置 (实心圆)
|
||||
cv2.circle(img, (int(ax), int(ay)), 15, color, -1)
|
||||
|
||||
# 如果是未读,把原本识别到的红点也再次标出
|
||||
if is_unread:
|
||||
for rx, ry in red_points:
|
||||
if abs(ry - ay) < 50 and rx > ax:
|
||||
cv2.circle(img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
|
||||
|
||||
logger.info(f"标注语音消息: ({ax}, {ay}), 发送者: {sender}, 状态: {status_text}")
|
||||
|
||||
# 保存结果
|
||||
cv2.imwrite(output_path, img)
|
||||
logger.info(f"✅ 调试图片已保存至: {output_path}")
|
||||
if input_pos:
|
||||
logger.info(f"📍 识别到输入框位置: {input_pos}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_cv_debug()
|
||||
asyncio.run(run_cv_debug())
|
||||
|
||||
@@ -5,6 +5,7 @@ import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
import hashlib
|
||||
|
||||
# 添加项目根目录到 sys.path
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -12,11 +13,9 @@ if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
from Util import Win32Patch
|
||||
|
||||
from WeiXin import WxUtil
|
||||
from WeiXin.WxUtil import perform_input_action, clean_screenshots_dir, find_template_match, find_all_template_matches
|
||||
from WeiXin.WxUtil import perform_input_action
|
||||
from Util.LlmUtil import get_llm_response
|
||||
from Util.EasyOcrKit import EasyOcrKit
|
||||
|
||||
# 配置日志
|
||||
log_dir = WxUtil.LOG_DIR
|
||||
@@ -27,30 +26,23 @@ if not os.path.exists(log_dir):
|
||||
logger = logging.getLogger("T5_AutoChatMonitor")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
# 清除现有的 handlers,防止重复打印或配置冲突
|
||||
if logger.hasHandlers():
|
||||
logger.handlers.clear()
|
||||
|
||||
# 创建 FileHandler
|
||||
log_file_path = os.path.join(log_dir, "T5_AutoChatMonitor.log")
|
||||
file_handler = logging.FileHandler(log_file_path, encoding='utf-8', mode='w')
|
||||
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# 创建 StreamHandler
|
||||
stream_handler = logging.StreamHandler()
|
||||
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
# 防止日志传播到 root logger,避免重复输出
|
||||
logger.propagate = False
|
||||
|
||||
# 打印日志文件位置,方便确认
|
||||
logger.info(f"日志文件路径: {log_file_path}")
|
||||
|
||||
# 配置参数
|
||||
CHECK_INTERVAL = 5 # 检查频率 (秒)
|
||||
TEMPLATE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates")
|
||||
|
||||
class ChatBot:
|
||||
def __init__(self):
|
||||
@@ -60,14 +52,10 @@ class ChatBot:
|
||||
self.d = WxUtil.connect_device()
|
||||
if not self.d:
|
||||
raise Exception("无法连接到设备,任务终止")
|
||||
self.last_message_text = ""
|
||||
self.last_processed_msg_id = None # 记录上一条已处理的消息标识 (文本+坐标)
|
||||
|
||||
self.last_processed_msg_hash = None # 记录最后一条已处理消息的哈希值
|
||||
self.screenshot_dir = WxUtil.OUTPUT_DIR
|
||||
|
||||
self.ocr_kit = EasyOcrKit(gpu=True)
|
||||
|
||||
self.is_first_run = True # 首次运行标志
|
||||
|
||||
self.persona = (
|
||||
"你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师,名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。"
|
||||
"你不仅是一位作文教学专家,更是一位心思细腻、能与家长共情的教育智者。"
|
||||
@@ -85,13 +73,12 @@ class ChatBot:
|
||||
"- 每学期开学招收小学三年级至六年级,初中七年级的学生入学,其它年段不招生。\n"
|
||||
)
|
||||
|
||||
async def get_reply(self, history_text):
|
||||
async def get_reply(self, last_message_text):
|
||||
prompt = (
|
||||
f"【教师人设】:{self.persona}\n\n"
|
||||
f"【近期聊天记录】:\n{history_text}\n\n"
|
||||
f"【最后一条消息】:\n{last_message_text}\n\n"
|
||||
"【任务要求】:\n"
|
||||
"请作为大张老师回复家长。**必须且只能针对聊天记录中的最后一条消息进行回复!**\n"
|
||||
"之前的聊天记录仅供参考上下文,如果之前的问题已经回答过,绝对不要重复回答。\n"
|
||||
"请作为大张老师回复家长。**必须且只能针对最后一条消息进行回复!**\n"
|
||||
"严禁发散,严禁编造家长没说过的情况。如果不清楚家长的意图,就温柔询问。\n"
|
||||
"字数严格控制在 50 字以内。直接输出回复正文。"
|
||||
)
|
||||
@@ -101,367 +88,63 @@ class ChatBot:
|
||||
full_response += chunk
|
||||
return full_response.strip().strip('"').strip('“').strip('”')
|
||||
|
||||
async def process_single_voice(self, voice_msg, next_msg=None, input_box_y=None):
|
||||
"""
|
||||
处理单个语音消息的完整流程:
|
||||
长按 -> CV找转文字 -> 点击 -> 等待 -> 截图OCR -> 长按 -> CV找取消转文字 -> 点击
|
||||
返回: 转换后的文本内容 (如果没有转换成功,返回 None)
|
||||
"""
|
||||
vx, vy = voice_msg['coordinates']
|
||||
content = voice_msg.get('content', '0"')
|
||||
logger.info(f"🎤 开始处理语音消息: {content}, 坐标: ({vx}, {vy})")
|
||||
|
||||
try:
|
||||
# 1. 长按语音消息
|
||||
logger.info("👆 正在长按语音消息...")
|
||||
self.d.long_click(vx, vy, 0.6)
|
||||
logger.info("✅ 长按完成,等待菜单...")
|
||||
time.sleep(0.3)
|
||||
|
||||
# 2. CV 模板匹配寻找 "转文字" 按钮
|
||||
menu_shot_path = os.path.join(self.screenshot_dir, "t6_menu_shot_convert.jpg")
|
||||
logger.info(f"📸 截取菜单图: {menu_shot_path}")
|
||||
self.d.screenshot(menu_shot_path)
|
||||
|
||||
convert_template = os.path.join(TEMPLATE_DIR, "zhun_wen_zi.jpg")
|
||||
logger.info(f"🔍 寻找模板: {convert_template}")
|
||||
convert_btn = find_template_match(menu_shot_path, convert_template, threshold=0.6)
|
||||
|
||||
if not convert_btn:
|
||||
logger.warning("❌ CV 未找到 '转文字' 按钮,尝试小范围 OCR 兜底...")
|
||||
# 尝试在该区域进行 OCR 识别,寻找 "转文字" 三个字
|
||||
ocr_results_menu = self.ocr_kit.read_text(menu_shot_path)
|
||||
for bbox, text, conf in ocr_results_menu:
|
||||
if "转文字" in text or "转文" in text or "文字" in text:
|
||||
cx = (bbox[0][0] + bbox[2][0]) / 2
|
||||
cy = (bbox[0][1] + bbox[2][1]) / 2
|
||||
convert_btn = (cx, cy)
|
||||
logger.info(f"✅ OCR 兜底找到 '转文字' 按钮: {convert_btn}")
|
||||
break
|
||||
|
||||
if not convert_btn:
|
||||
logger.warning("❌ CV 和 OCR 均未找到 '转文字' 按钮,取消操作。")
|
||||
# 点击屏幕中心区域的空白处关闭菜单,避免点到顶部返回键
|
||||
self.d.click(500, 500)
|
||||
return None
|
||||
|
||||
logger.info(f"✅ 最终找到 '转文字' 按钮坐标: {convert_btn}")
|
||||
self.d.click(convert_btn[0], convert_btn[1])
|
||||
|
||||
# 3. 动态等待转换
|
||||
duration_str = content.replace('"', '').strip()
|
||||
try:
|
||||
duration = int(duration_str)
|
||||
except:
|
||||
duration = 10
|
||||
wait_seconds = max(2, duration / 5.0)
|
||||
logger.info(f"⏳ 语音时长 {duration}s,等待转换 {wait_seconds:.1f}s...")
|
||||
time.sleep(wait_seconds)
|
||||
|
||||
# 4. 截图并 OCR 识别内容
|
||||
ocr_shot_path = os.path.join(self.screenshot_dir, "t6_ocr_shot.jpg")
|
||||
logger.info(f"📸 截取 OCR 识别图: {ocr_shot_path}")
|
||||
self.d.screenshot(ocr_shot_path)
|
||||
|
||||
# OCR 识别
|
||||
# 策略:识别整个屏幕,但只提取位于当前语音消息下方,且在下一条消息(如果有)上方的内容
|
||||
logger.info("📖 开始 OCR 识别...")
|
||||
ocr_results = self.ocr_kit.read_text(ocr_shot_path)
|
||||
logger.info(f"✅ OCR 识别完成,获取 {len(ocr_results)} 个文本块")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ process_single_voice 发生异常: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
# 按 Y 坐标排序,确保从上往下处理
|
||||
ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
|
||||
|
||||
extracted_text = []
|
||||
|
||||
# 准备下一条消息的内容片段作为停止条件
|
||||
next_msg_snippet = None
|
||||
if next_msg and next_msg.get("type") == "text":
|
||||
c = next_msg.get("content", "").strip()
|
||||
if c:
|
||||
next_msg_snippet = c[:8] # 取前8个字符作为指纹
|
||||
|
||||
for bbox, text, conf in ocr_results:
|
||||
# bbox center y
|
||||
c_y = (bbox[0][1] + bbox[2][1]) / 2
|
||||
|
||||
# 1. 过滤掉当前语音气泡及以上的内容
|
||||
# 语音气泡中心是 vy,底部大概在 vy + 30 左右
|
||||
if c_y <= vy + 25:
|
||||
continue
|
||||
|
||||
# 2. 如果有输入框坐标,过滤掉输入框以下的内容
|
||||
if input_box_y and c_y >= input_box_y - 30:
|
||||
continue
|
||||
|
||||
# 3. 如果遇到下一条消息的内容,停止读取
|
||||
if next_msg_snippet and next_msg_snippet in text:
|
||||
logger.info(f"🛑 遇到下一条消息内容 '{text}',停止 OCR 录入。")
|
||||
break
|
||||
|
||||
# 4. 如果下一条是语音,尝试通过时长文本判断停止
|
||||
if next_msg and next_msg.get("type") == "voice":
|
||||
v_dur = next_msg.get("content", "").strip()
|
||||
# 语音时长通常比较短,且包含 " 符号
|
||||
if v_dur and v_dur in text and len(text) < 10:
|
||||
logger.info(f"🛑 遇到下一条语音时长 '{text}',停止 OCR 录入。")
|
||||
break
|
||||
|
||||
# 5. 安全兜底:如果距离当前语音气泡太远(超过600像素),停止
|
||||
# 这可以防止读取到屏幕底部无关的内容
|
||||
if c_y > vy + 600:
|
||||
break
|
||||
|
||||
extracted_text.append(text)
|
||||
|
||||
full_text = " ".join(extracted_text)
|
||||
logger.info(f"📝 OCR 识别结果: {full_text}")
|
||||
|
||||
# 5. 再次长按语音消息 (为了取消转换)
|
||||
# 注意:转换出文字后,界面可能会发生位移。
|
||||
# 但通常语音气泡的相对位置(如果是最后一条)可能变化不大,或者我们假设用户不滑动
|
||||
# 更稳妥的是:重新识别一次语音气泡位置?
|
||||
# 用户说:"这样原来什么样,识别完就是什么样",意味着我们要恢复原状。
|
||||
# 我们假设点击原来的位置还能点到语音气泡(如果它没被顶上去太多)
|
||||
# 或者,我们可以点击转换出来的文字区域?
|
||||
# 让我们尝试点击原来的坐标。
|
||||
|
||||
self.d.long_click(vx, vy, 0.6)
|
||||
time.sleep(0.3)
|
||||
|
||||
# 6. CV 模板匹配寻找 "取消转文字" 按钮
|
||||
menu_shot_path_cancel = os.path.join(self.screenshot_dir, "t6_menu_shot_cancel.jpg")
|
||||
self.d.screenshot(menu_shot_path_cancel)
|
||||
|
||||
cancel_template = os.path.join(TEMPLATE_DIR, "cancel_zhuan_wen_zi.jpg")
|
||||
cancel_btn = find_template_match(menu_shot_path_cancel, cancel_template, threshold=0.6)
|
||||
|
||||
if cancel_btn:
|
||||
logger.info(f"✅ CV 找到 '取消转文字' 按钮: {cancel_btn}")
|
||||
self.d.click(cancel_btn[0], cancel_btn[1])
|
||||
else:
|
||||
logger.warning("❌ CV 未找到 '取消转文字' 按钮,点击中心区域关闭菜单。")
|
||||
self.d.click(500, 500)
|
||||
|
||||
return full_text
|
||||
|
||||
async def run(self):
|
||||
logger.info("🚀 大张老师自动巡课系统启动...")
|
||||
|
||||
last_screen_md5 = None
|
||||
logger.info("🚀 大张老师自动巡课系统启动 (CV版)...")
|
||||
|
||||
while True:
|
||||
try:
|
||||
logger.info("🔍 正在扫描当前界面内容...")
|
||||
# 1. 截图并分析
|
||||
image_path = os.path.join(self.screenshot_dir, "current_screen.jpg")
|
||||
self.d.screenshot(image_path)
|
||||
|
||||
# 1. 截图
|
||||
tmp_shot = os.path.join(self.screenshot_dir, "t6_monitor_temp.jpg")
|
||||
logger.info(f"📸 正在截取屏幕... ({datetime.now().strftime('%H:%M:%S')})")
|
||||
self.d.screenshot(tmp_shot)
|
||||
# 使用 WxUtil 的集中式分析逻辑
|
||||
# 它会自动处理语音转文字,并返回对话列表和输入框坐标
|
||||
dialogue_log, input_pos = await WxUtil.analyze_chat_image(image_path, self.screenshot_dir, device=self.d)
|
||||
|
||||
# 计算 MD5 并去重
|
||||
import hashlib
|
||||
with open(tmp_shot, 'rb') as f:
|
||||
current_md5 = hashlib.md5(f.read()).hexdigest()
|
||||
|
||||
if last_screen_md5 and current_md5 == last_screen_md5:
|
||||
logger.info("😴 屏幕内容未变,跳过本次循环。")
|
||||
if not dialogue_log:
|
||||
logger.info("😴 未发现有效消息,等待下一次轮询。")
|
||||
await asyncio.sleep(CHECK_INTERVAL)
|
||||
continue
|
||||
|
||||
last_screen_md5 = current_md5
|
||||
# 2. 只关注最后一条消息
|
||||
last_msg = dialogue_log[-1]
|
||||
logger.info(f"最后一条消息: {last_msg}")
|
||||
|
||||
# 2. 本地视觉分析 (替代 VLM)
|
||||
logger.info("<EFBFBD>️ 正在进行本地视觉扫描...")
|
||||
# 计算最后一条消息的哈希值,用于去重
|
||||
current_msg_hash = hashlib.md5(last_msg.encode('utf-8')).hexdigest()
|
||||
|
||||
# A. 寻找语音图标 (audio.jpg) 和 红点 (red_point.jpg)
|
||||
audio_template = os.path.join(TEMPLATE_DIR, "audio.jpg")
|
||||
red_point_template = os.path.join(TEMPLATE_DIR, "red_point.jpg")
|
||||
# 3. 判断是否需要回复
|
||||
# 规则:最后一条消息由“对方”发送,且不是上一次处理过的消息
|
||||
if "对方:" in last_msg:
|
||||
if current_msg_hash != self.last_processed_msg_hash:
|
||||
logger.info(f"💡 发现新消息,准备生成回复: {last_msg}")
|
||||
|
||||
audio_matches = find_all_template_matches(tmp_shot, audio_template, threshold=0.8)
|
||||
red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8)
|
||||
# 生成回复
|
||||
reply = await self.get_reply(last_msg)
|
||||
|
||||
# B. 本地 OCR 识别全文以构建上下文
|
||||
ocr_results = self.ocr_kit.read_text(tmp_shot)
|
||||
# 按 Y 坐标排序
|
||||
ocr_results.sort(key=lambda x: (x[0][0][1] + x[0][2][1]) / 2)
|
||||
|
||||
dialogue_log = []
|
||||
voice_messages = []
|
||||
|
||||
# 准备可视化调试图
|
||||
import cv2
|
||||
import numpy as np
|
||||
debug_img = cv2.imread(tmp_shot)
|
||||
|
||||
# 记录已匹配到语音图标的 OCR 块索引
|
||||
matched_ocr_indices = set()
|
||||
|
||||
# 先处理语音图标匹配
|
||||
for ax, ay in audio_matches:
|
||||
# 排除顶部标题栏(0-300)和底部输入区(1800+)
|
||||
if ay < 300 or ay > 1800:
|
||||
logger.info(f"⏭️ 忽略区域外语音图标: ({ax}, {ay})")
|
||||
continue
|
||||
|
||||
sender = "对方" if ax < 500 else "我"
|
||||
logger.info(f"🎙️ 发现语音图标: x={ax}, y={ay}, 发送者={sender}")
|
||||
is_unread = False
|
||||
if red_points:
|
||||
for rx, ry in red_points:
|
||||
# 红点通常在语音图标右侧,且 Y 轴相近
|
||||
if abs(ry - ay) < 50 and rx > ax:
|
||||
is_unread = True
|
||||
# 绘制红点
|
||||
cv2.circle(debug_img, (int(rx), int(ry)), 12, (0, 0, 255), -1)
|
||||
break
|
||||
|
||||
# 寻找附近的时长文字 (OCR)
|
||||
duration_text = "语音"
|
||||
for idx, (bbox, text, conf) in enumerate(ocr_results):
|
||||
c_x = (bbox[0][0] + bbox[2][0]) / 2
|
||||
c_y = (bbox[0][1] + bbox[2][1]) / 2
|
||||
if abs(c_y - ay) < 40 and abs(c_x - ax) < 300:
|
||||
if '"' in text or text.isdigit():
|
||||
duration_text = text
|
||||
matched_ocr_indices.add(idx)
|
||||
break
|
||||
|
||||
# 计算点击坐标:直接点击语音图标中心
|
||||
click_x, click_y = ax, ay
|
||||
|
||||
# 绘制视觉反馈
|
||||
# 1. 语音图标用绿框
|
||||
cv2.rectangle(debug_img, (int(ax-30), int(ay-30)), (int(ax+30), int(ay+30)), (0, 255, 0), 3)
|
||||
# 2. 点击位置用红点 (用户偏好)
|
||||
cv2.circle(debug_img, (int(click_x), int(click_y)), 15, (0, 0, 255), -1)
|
||||
|
||||
v_msg = {
|
||||
"type": "voice",
|
||||
"content": duration_text,
|
||||
"coordinates": [click_x, click_y],
|
||||
"sender": sender,
|
||||
"is_unread": is_unread
|
||||
}
|
||||
if sender == "对方":
|
||||
voice_messages.append(v_msg)
|
||||
dialogue_log.append({
|
||||
"y": ay,
|
||||
"text": f"{sender}: [语音] {duration_text}",
|
||||
"is_voice": True,
|
||||
"id": f"voice_{ax}_{ay}",
|
||||
"v_msg": v_msg
|
||||
})
|
||||
|
||||
# 处理剩余的 OCR 文字块 (普通文本)
|
||||
for idx, (bbox, text, conf) in enumerate(ocr_results):
|
||||
if idx in matched_ocr_indices: continue
|
||||
|
||||
x_min, x_max = bbox[0][0], bbox[2][0]
|
||||
y_min, y_max = bbox[0][1], bbox[2][1]
|
||||
c_x, c_y = (x_min + x_max) / 2, (y_min + y_max) / 2
|
||||
|
||||
if c_y < 300 or c_y > 1800: continue
|
||||
|
||||
if x_min < 250 and x_max < 700:
|
||||
sender, color = "对方", (0, 255, 0)
|
||||
elif x_max > 800 and x_min > 300:
|
||||
sender, color = "我", (255, 0, 0)
|
||||
else:
|
||||
sender, color = "系统", (128, 128, 128)
|
||||
|
||||
if sender != "系统":
|
||||
logger.info(f"💬 发现文本消息: x={c_x}, y={c_y}, 发送者={sender}, 内容={text}")
|
||||
cv2.rectangle(debug_img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, 1)
|
||||
dialogue_log.append({
|
||||
"y": c_y,
|
||||
"text": f"{sender}: {text}",
|
||||
"is_voice": False
|
||||
})
|
||||
|
||||
# 按 Y 轴重新排序整个对话日志
|
||||
dialogue_log.sort(key=lambda x: x['y'])
|
||||
|
||||
# 保存调试图
|
||||
debug_shot_path = os.path.join(self.screenshot_dir, "t6_debug_view.jpg")
|
||||
cv2.imwrite(debug_shot_path, debug_img)
|
||||
logger.info(f"🎨 已保存视觉调试图: {debug_shot_path}")
|
||||
|
||||
# C. 寻找输入框 (CV 模板匹配)
|
||||
input_template = os.path.join(TEMPLATE_DIR, "input_box.jpg") # 假设有这个模板
|
||||
input_center = find_template_match(tmp_shot, input_template, threshold=0.6)
|
||||
if not input_center:
|
||||
# 几何兜底:屏幕底部 88% 处
|
||||
from PIL import Image
|
||||
with Image.open(tmp_shot) as img:
|
||||
w, h = img.size
|
||||
input_center = [w // 2, int(h * 0.88)]
|
||||
logger.info(f"<EFBFBD> 使用几何兜底输入框坐标: {input_center}")
|
||||
|
||||
# 4. & 5. 统一处理最后一条消息逻辑 (只看最后一条)
|
||||
should_reply = False
|
||||
input_y = input_center[1] if input_center else None
|
||||
|
||||
if dialogue_log:
|
||||
last_item = dialogue_log[-1]
|
||||
last_text = last_item["text"]
|
||||
# 构造唯一标识符:文本内容 + 坐标 (Y坐标取整到10像素以容纳轻微位移)
|
||||
current_msg_id = f"{last_text}_{int(last_item['y']/10)*10}"
|
||||
|
||||
# 核心规则:只有当最后一条消息是“对方”说的,且内容未处理过,才回复。
|
||||
if last_text.startswith("对方"):
|
||||
if current_msg_id != self.last_processed_msg_id:
|
||||
logger.info(f"💡 发现新消息: {last_text}")
|
||||
|
||||
# 如果是语音,且需要回复,则先转换
|
||||
if last_item.get("is_voice"):
|
||||
v_msg = last_item.get("v_msg")
|
||||
if v_msg:
|
||||
logger.info(f"🎤 最后一条是语音,开始转换: {v_msg['content']}")
|
||||
converted_text = await self.process_single_voice(v_msg, None, input_y)
|
||||
if converted_text:
|
||||
# 更新文本内容以便 LLM 理解
|
||||
last_item["text"] = f"对方: [语音转文字: {converted_text}]"
|
||||
logger.info(f"✅ 语音转换成功: {converted_text}")
|
||||
else:
|
||||
logger.warning("⚠️ 语音转换未提取到文字,将尝试直接回复或跳过。")
|
||||
|
||||
should_reply = True
|
||||
else:
|
||||
# 消息已处理过
|
||||
should_reply = False
|
||||
else:
|
||||
# 最后一条是我发送的,或者是系统消息
|
||||
should_reply = False
|
||||
# 记录一下,避免在没有新消息时重复进入逻辑
|
||||
if current_msg_id != self.last_processed_msg_id:
|
||||
self.last_processed_msg_id = current_msg_id
|
||||
logger.info(f"⚪ 最后一条消息非对方发送,无需回复: {last_text}")
|
||||
|
||||
if should_reply:
|
||||
logger.info("🤖 准备调用 LLM 生成回复...")
|
||||
# 立即更新状态,防止在回复生成期间重复触发
|
||||
self.last_processed_msg_id = current_msg_id
|
||||
|
||||
# 构建完整历史用于上下文
|
||||
final_dialogue_texts = [item['text'] for item in dialogue_log]
|
||||
history_text = "\n".join(final_dialogue_texts)
|
||||
|
||||
reply = await self.get_reply(history_text)
|
||||
if reply:
|
||||
logger.info(f"💡 LLM 回复: {reply}")
|
||||
if input_center:
|
||||
perform_input_action(self.d, input_center, reply)
|
||||
time.sleep(1) # 等待发送完成
|
||||
logger.info(f"🤖 LLM 回复: {reply}")
|
||||
# 执行输入和发送
|
||||
if input_pos:
|
||||
perform_input_action(self.d, input_pos, reply)
|
||||
logger.info("✅ 回复已发送")
|
||||
# 成功发送后更新最后处理的消息哈希
|
||||
self.last_processed_msg_hash = current_msg_hash
|
||||
else:
|
||||
logger.warning("⚠️ LLM 未生成有效回复。")
|
||||
logger.warning("❌ 未找到输入框位置,无法发送回复")
|
||||
else:
|
||||
logger.warning("⚠️ LLM 未生成有效回复")
|
||||
else:
|
||||
# 消息已处理过,不重复回复
|
||||
pass
|
||||
else:
|
||||
# 最后一条是我发送的或者是系统消息,更新哈希以防之后重复处理(如果之后又变成对方发)
|
||||
# 或者简单地跳过
|
||||
if current_msg_hash != self.last_processed_msg_hash:
|
||||
logger.info(f"⚪ 最后一条消息非对方发送,无需回复: {last_msg}")
|
||||
self.last_processed_msg_hash = current_msg_hash
|
||||
|
||||
self.is_first_run = False
|
||||
|
||||
# 休眠
|
||||
# 4. 休眠
|
||||
await asyncio.sleep(CHECK_INTERVAL)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
468
WeiXin/WxUtil.py
468
WeiXin/WxUtil.py
@@ -14,11 +14,9 @@ if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
import json
|
||||
from Util.VLMKit import VLMKit
|
||||
from Util.EasyOcrKit import EasyOcrKit
|
||||
|
||||
# 初始化 VLMKit 和 EasyOcrKit
|
||||
vlm_kit = VLMKit()
|
||||
# 初始化 EasyOcrKit
|
||||
ocr_kit = EasyOcrKit()
|
||||
|
||||
# 配置日志
|
||||
@@ -72,302 +70,216 @@ def connect_device():
|
||||
logger.error(f"设备连接失败: {e}")
|
||||
return None
|
||||
|
||||
async def get_vlm_json(image_path, prompt):
|
||||
"""
|
||||
通用 VLM 分析函数,返回 JSON 数据 (自动处理归一化坐标的反归一化)
|
||||
"""
|
||||
try:
|
||||
# 调用 VLM
|
||||
response = await vlm_kit.analyze_image(image_path, prompt)
|
||||
json_str = vlm_kit.extract_json(response)
|
||||
result_data = json.loads(json_str)
|
||||
|
||||
# 获取图片尺寸进行坐标反归一化
|
||||
try:
|
||||
from PIL import Image
|
||||
with Image.open(image_path) as img:
|
||||
width, height = img.size
|
||||
|
||||
# 定义反归一化函数
|
||||
def denormalize(point):
|
||||
if not point or len(point) != 2:
|
||||
return point
|
||||
return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)]
|
||||
|
||||
# 递归遍历字典进行反归一化 (仅针对常见坐标字段 center, input_box)
|
||||
def recursive_denormalize(data):
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
if key in ["center", "input_box", "coordinates"] and isinstance(value, list) and len(value) == 2:
|
||||
data[key] = denormalize(value)
|
||||
elif isinstance(value, (dict, list)):
|
||||
recursive_denormalize(value)
|
||||
elif isinstance(data, list):
|
||||
for item in data:
|
||||
recursive_denormalize(item)
|
||||
|
||||
recursive_denormalize(result_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"坐标反归一化失败: {e},将使用原始坐标")
|
||||
|
||||
return result_data
|
||||
except Exception as e:
|
||||
logger.error(f"VLM Analysis Failed: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
async def get_vlm_analysis(image_path):
|
||||
"""
|
||||
仅调用 VLM 分析图片,返回原始 JSON 数据 (dict)
|
||||
"""
|
||||
logger.info(f"正在使用 VLM 分析图片: {image_path}")
|
||||
|
||||
# 构造 Prompt
|
||||
prompt = """
|
||||
请分析这张微信聊天截图,提取所有对话消息。
|
||||
|
||||
【核心规则 - 优先级最高】
|
||||
1. 🚀 **从下往上扫描**:必须确保屏幕最底部的消息被识别。很多时候最底部的消息是最重要的。
|
||||
2. 🔴 **未读红点 (Unread)**:极度关注语音气泡右上角的红点。如果有红点,`is_unread` 必须为 true。
|
||||
3. 📦 **完整性**:识别图中【所有】可见的消息气泡,包括文本消息、语音消息、系统提示(如“昨天 10:36”、“你撤回了一条消息”)。
|
||||
|
||||
【消息类型判别】
|
||||
- **发送者 (Sender)**:左侧头像为“对方”(Other),右侧头像为“我”(Me)。
|
||||
- **语音 (Voice)**:
|
||||
- 气泡内只有时长(如 5")和声波图标。
|
||||
- **重点**:如果语音气泡右侧有灰色的“转文字”字样或红点,且下方没有对应的文本翻译气泡,说明它【尚未转换】。
|
||||
- `status` 判断:只有当语音气泡【正下方】紧跟着一个相同发送者的文本气泡(内容是翻译结果),`status` 才为 "converted"。否则为 "unconverted"。
|
||||
- **文本 (Text)**:气泡内包含具体的文字内容。
|
||||
|
||||
【坐标系统】
|
||||
- 使用 [0-1000] 归一化坐标。返回气泡的几何中心点 `center`。
|
||||
- 识别底部输入框的位置 `input_box`。
|
||||
|
||||
【输出格式】
|
||||
请返回纯 JSON 格式:
|
||||
{
|
||||
"is_chat_interface": true,
|
||||
"input_box": [x, y],
|
||||
"messages": [
|
||||
{
|
||||
"type": "voice" | "text" | "system",
|
||||
"sender": "对方" | "我" | "系统",
|
||||
"status": "converted" | "unconverted",
|
||||
"is_unread": true | false,
|
||||
"center": [x, y],
|
||||
"content": "消息内容或时长"
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
"""
|
||||
2. 🔴 **红点 (Unread)**:极度关注语音气泡右上角的红点。如果有红点,`is_unread` 必须为 true。
|
||||
3. 📦 **完整性**:识别图中【所有】可见的消息气泡。不要遗漏任何一个,特别是连续的语音消息。
|
||||
|
||||
【消息类型判别】
|
||||
- **发送者 (Sender)**:左侧头像为“对方”(Other),右侧头像为“我”(Me)。
|
||||
- **语音 (Voice)**:气泡内只有时长(如 5")和声波图标。
|
||||
- 语音气泡右侧可能有“转文字”或“取消”等灰色小字,请忽略这些文字,气泡依然是 Voice。
|
||||
- `status` 判断:如果语音气泡下方【紧接着】有一个属于同一人的文本气泡,且内容看起来像翻译结果,则 `status` 为 "converted",否则为 "unconverted"。
|
||||
- **文本 (Text)**:气泡内包含具体的文字内容。
|
||||
|
||||
【坐标系统】
|
||||
- 使用 [0-1000] 归一化坐标。返回气泡的几何中心点 `center`。
|
||||
- 识别底部输入框的位置 `input_box`。
|
||||
|
||||
【输出格式】
|
||||
请返回纯 JSON 格式:
|
||||
{
|
||||
"is_chat_interface": true,
|
||||
"input_box": [x, y],
|
||||
"messages": [
|
||||
{
|
||||
"type": "voice",
|
||||
"sender": "对方" | "我",
|
||||
"status": "converted" | "unconverted",
|
||||
"is_unread": true | false,
|
||||
"center": [x, y],
|
||||
"content": "8\""
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
try:
|
||||
# 调用 VLM
|
||||
response = await vlm_kit.analyze_image(image_path, prompt)
|
||||
logger.info(f"VLM Raw Response: {response}") # 打印原始响应以便调试
|
||||
json_str = vlm_kit.extract_json(response)
|
||||
result_data = json.loads(json_str)
|
||||
|
||||
# 获取图片尺寸进行坐标反归一化
|
||||
try:
|
||||
from PIL import Image
|
||||
with Image.open(image_path) as img:
|
||||
width, height = img.size
|
||||
|
||||
# 定义反归一化函数
|
||||
def denormalize(point):
|
||||
if not point or len(point) != 2:
|
||||
return point
|
||||
return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)]
|
||||
|
||||
# 反归一化 input_box
|
||||
if result_data.get("input_box"):
|
||||
result_data["input_box"] = denormalize(result_data["input_box"])
|
||||
|
||||
# 反归一化 messages
|
||||
if result_data.get("messages"):
|
||||
for msg in result_data["messages"]:
|
||||
if msg.get("center"):
|
||||
msg["center"] = denormalize(msg["center"])
|
||||
if msg.get("coordinates"): # 兼容旧字段
|
||||
msg["coordinates"] = denormalize(msg["coordinates"])
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"坐标反归一化失败: {e},将使用原始坐标")
|
||||
|
||||
return result_data
|
||||
except Exception as e:
|
||||
logger.error(f"VLM Analysis Failed: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
async def analyze_chat_image(image_path, output_path, device=None, target_name="对方"):
|
||||
"""
|
||||
使用 VLM 识别微信聊天截图中的对话内容、语音消息状态以及输入框位置
|
||||
替代原本的 CV/OCR 方案
|
||||
全面采用 CV + OCR 识别微信聊天截图中的最后一条消息
|
||||
不再使用 VLM
|
||||
"""
|
||||
|
||||
# 语音识别标志
|
||||
should_trigger_convert = False
|
||||
|
||||
try:
|
||||
result_data = await get_vlm_analysis(image_path)
|
||||
|
||||
if not result_data:
|
||||
return [], None
|
||||
|
||||
try:
|
||||
# 检查是否为聊天界面
|
||||
is_chat = result_data.get("is_chat_interface", False)
|
||||
if not is_chat:
|
||||
logger.warning("VLM 判断当前不是微信聊天界面")
|
||||
return None, None
|
||||
|
||||
if isinstance(result_data, list):
|
||||
# 兼容旧格式
|
||||
messages = result_data
|
||||
input_field_coordinates = None
|
||||
else:
|
||||
messages = result_data.get("messages", [])
|
||||
input_field_coordinates = result_data.get("input_box") # input_box
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"解析 VLM 结果失败: {e}")
|
||||
return [], None
|
||||
|
||||
dialogue_log = []
|
||||
unconverted_voices = []
|
||||
|
||||
# 处理识别结果
|
||||
for msg in messages:
|
||||
sender = msg.get('sender', '未知')
|
||||
msg_type = msg.get('type', 'other')
|
||||
content = msg.get('content', '')
|
||||
coords = msg.get('center', [0, 0]) # center
|
||||
status = msg.get('status', 'unconverted')
|
||||
is_unread = msg.get('is_unread', False)
|
||||
is_converted = (status == "converted")
|
||||
|
||||
unread_mark = "[未读]" if is_unread else ""
|
||||
|
||||
# 记录对话日志
|
||||
if msg_type == 'voice':
|
||||
if is_converted:
|
||||
dialogue_log.append(f"{sender}: {unread_mark}[语音] {content} (已转换)")
|
||||
else:
|
||||
dialogue_log.append(f"{sender}: {unread_mark}[语音] (待转换)")
|
||||
# 将 center 转换为 coordinates 供后续使用
|
||||
msg['coordinates'] = coords
|
||||
unconverted_voices.append(msg)
|
||||
elif msg_type == 'text':
|
||||
dialogue_log.append(f"{sender}: {content}")
|
||||
|
||||
logger.info(f"VLM 识别: {sender} [{msg_type}] {content} (Converted: {is_converted}, Unread: {is_unread})")
|
||||
|
||||
# 处理未转换的语音消息
|
||||
if unconverted_voices:
|
||||
# 优先级:1. 有红点的最后一条 2. 没红点的最后一条
|
||||
unread_voices = [v for v in unconverted_voices if v.get('is_unread')]
|
||||
if unread_voices:
|
||||
logger.info(f"发现 {len(unread_voices)} 条未读语音消息,优先处理最后一条...")
|
||||
voice_to_process = unread_voices[-1]
|
||||
else:
|
||||
logger.info(f"发现 {len(unconverted_voices)} 条未转换语音消息,处理最后一条...")
|
||||
voice_to_process = unconverted_voices[-1]
|
||||
|
||||
# 仅保留选中的一条进行处理
|
||||
unconverted_voices = [voice_to_process]
|
||||
|
||||
# 使用传入的 device 或创建新连接
|
||||
# 1. 初始化
|
||||
d = device if device else connect_device()
|
||||
if not d:
|
||||
return [], None
|
||||
|
||||
for voice in unconverted_voices:
|
||||
vx, vy = voice['coordinates']
|
||||
logger.info(f"长按语音消息: ({vx}, {vy})")
|
||||
d.long_click(vx, vy, 1.5)
|
||||
time.sleep(1.0)
|
||||
# 2. 读取图片
|
||||
img = cv2.imread(image_path)
|
||||
if img is None:
|
||||
logger.error(f"无法读取图片: {image_path}")
|
||||
return [], None
|
||||
h, w = img.shape[:2]
|
||||
|
||||
# 查找“转文字” (使用 OCR)
|
||||
menu_shot_path = os.path.join(os.path.dirname(image_path), "temp_menu_shot.jpg")
|
||||
d.screenshot(menu_shot_path)
|
||||
# 3. 模板匹配寻找语音图标和红点
|
||||
audio_template = os.path.join(TEMPLATE_DIR, "audio.jpg")
|
||||
red_point_template = os.path.join(TEMPLATE_DIR, "red_point.jpg")
|
||||
|
||||
# OCR 识别
|
||||
ocr_results = ocr_kit.read_text(menu_shot_path)
|
||||
convert_btn_center = None
|
||||
audio_matches = find_all_template_matches(image_path, audio_template, threshold=0.8)
|
||||
red_points = find_all_template_matches(image_path, red_point_template, threshold=0.8)
|
||||
|
||||
for bbox, text, conf in ocr_results:
|
||||
if "转文字" in text or "转换为文字" in text:
|
||||
# bbox is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
# Calculate center
|
||||
c_x = int((bbox[0][0] + bbox[2][0]) / 2)
|
||||
c_y = int((bbox[0][1] + bbox[2][1]) / 2)
|
||||
convert_btn_center = (c_x, c_y)
|
||||
# 4. OCR 识别所有文本
|
||||
logger.info("正在执行 OCR 识别...")
|
||||
ocr_results = ocr_kit.read_text(image_path)
|
||||
|
||||
# 5. 整合所有消息
|
||||
messages = []
|
||||
debug_img = img.copy() # 初始化调试图
|
||||
|
||||
# A. 添加语音消息
|
||||
for ax, ay in audio_matches:
|
||||
# 过滤掉顶部和底部的非聊天区域 (经验值: 顶部150, 底部250)
|
||||
if ay < 150 or ay > h - 250:
|
||||
continue
|
||||
|
||||
sender = "对方" if ax < w / 2 else "我"
|
||||
is_unread = False
|
||||
for rx, ry in red_points:
|
||||
# 红点通常在语音图标右侧且 Y 轴相近
|
||||
if abs(ry - ay) < 50 and rx > ax:
|
||||
is_unread = True
|
||||
break
|
||||
|
||||
if convert_btn_center:
|
||||
logger.info(f"OCR 找到 '转文字' 按钮: {convert_btn_center}")
|
||||
d.click(convert_btn_center[0], convert_btn_center[1])
|
||||
should_trigger_convert = True
|
||||
# 根据已读/未读画框:未读红框,已读绿框
|
||||
color = (0, 0, 255) if is_unread else (0, 255, 0)
|
||||
cv2.rectangle(debug_img, (ax-30, ay-30), (ax+30, ay+30), color, 2)
|
||||
|
||||
# 动态等待: 60s语音约需10s转换,比例约 1/6
|
||||
duration_str = voice.get('content', '0').replace('"', '').strip()
|
||||
try:
|
||||
duration = int(duration_str)
|
||||
except:
|
||||
duration = 10 # 默认值
|
||||
# --- 新增:判断是否已转文字 ---
|
||||
is_converted = False
|
||||
for bbox, text, conf in ocr_results:
|
||||
c_x = int((bbox[0][0] + bbox[2][0]) / 2)
|
||||
c_y = int((bbox[0][1] + bbox[2][1]) / 2)
|
||||
# 转换后的文字通常在语音图标下方 30-300 像素内,且水平位置相近
|
||||
if 30 < c_y - ay < 300 and abs(c_x - ax) < 200:
|
||||
is_converted = True
|
||||
break
|
||||
|
||||
wait_seconds = max(2, duration / 5.0) # 稍微多等一点,用 /5.0
|
||||
logger.info(f"语音时长 {duration}s,预计等待转换 {wait_seconds:.1f}s...")
|
||||
time.sleep(wait_seconds)
|
||||
label = "YES" if is_converted else "NO"
|
||||
# 在框的右侧标注 YES 或 NO
|
||||
cv2.putText(debug_img, label, (ax + 40, ay + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)
|
||||
# --- 结束 ---
|
||||
|
||||
messages.append({
|
||||
"type": "voice",
|
||||
"sender": sender,
|
||||
"center": (ax, ay),
|
||||
"y": ay,
|
||||
"is_unread": is_unread,
|
||||
"is_converted": is_converted
|
||||
})
|
||||
|
||||
# B. 添加文本消息
|
||||
# 简单策略:排除掉明显是系统时间、输入框或顶部标题的文字
|
||||
for bbox, text, conf in ocr_results:
|
||||
c_x = int((bbox[0][0] + bbox[2][0]) / 2)
|
||||
c_y = int((bbox[0][1] + bbox[2][1]) / 2)
|
||||
|
||||
# 过滤区域
|
||||
if c_y < 150 or c_y > h - 250:
|
||||
continue
|
||||
|
||||
# 过滤掉单字(可能是头像旁边的文字或杂质)和某些系统词
|
||||
if len(text) < 1 and "昨天" not in text and "今天" not in text:
|
||||
continue
|
||||
|
||||
sender = "对方" if c_x < w / 2 else "我"
|
||||
messages.append({
|
||||
"type": "text",
|
||||
"sender": sender,
|
||||
"content": text,
|
||||
"center": (c_x, c_y),
|
||||
"y": c_y
|
||||
})
|
||||
|
||||
# 6. 排序并找出最后一条消息
|
||||
if not messages:
|
||||
logger.warning("未发现任何消息")
|
||||
if output_path:
|
||||
cv2.imwrite(output_path, debug_img)
|
||||
return [], None
|
||||
|
||||
# 按 Y 坐标从上到下排序
|
||||
messages.sort(key=lambda x: x['y'])
|
||||
last_msg = messages[-1]
|
||||
|
||||
if output_path:
|
||||
cv2.imwrite(output_path, debug_img)
|
||||
logger.info(f"调试图已保存: {output_path}")
|
||||
|
||||
dialogue_log = []
|
||||
input_field_coordinates = (w // 2, int(h * 0.9)) # 默认输入框位置
|
||||
|
||||
# 7. 自动处理所有“红框 + NO”的语音消息
|
||||
unconverted_voices = [m for m in messages if m['type'] == 'voice' and m.get('is_unread') and not m.get('is_converted')]
|
||||
|
||||
if unconverted_voices:
|
||||
logger.info(f"发现 {len(unconverted_voices)} 条未转换的未读语音,开始处理...")
|
||||
|
||||
for v_msg in unconverted_voices:
|
||||
vx, vy = int(v_msg['center'][0]), int(v_msg['center'][1])
|
||||
logger.info(f"--- 正在处理语音消息 ({vx}, {vy}) ---")
|
||||
|
||||
# A. 长按语音
|
||||
logger.info(f"正在长按语音消息 ({vx}, {vy})...")
|
||||
d.long_click(vx, vy, 1.5)
|
||||
time.sleep(1.5)
|
||||
|
||||
# B. 截图寻找“转文字”按钮
|
||||
menu_shot = os.path.join(OUTPUT_DIR, f"voice_menu_{vy}.jpg")
|
||||
d.screenshot(menu_shot)
|
||||
zhuan_template = os.path.join(TEMPLATE_DIR, "zhun_wen_zi.jpg")
|
||||
|
||||
# 降低阈值到 0.7 以增加匹配成功率
|
||||
btn_pos = find_template_match(menu_shot, zhuan_template, threshold=0.7)
|
||||
|
||||
if btn_pos:
|
||||
btn_x, btn_y = int(btn_pos[0]), int(btn_pos[1])
|
||||
logger.info(f"✅ 找到'转文字'按钮: ({btn_x}, {btn_y}),点击中...")
|
||||
d.click(btn_x, btn_y)
|
||||
|
||||
# 等待转换完成 (根据语音长度,通常 3-5 秒足够)
|
||||
logger.info("等待语音转文字完成...")
|
||||
time.sleep(5.0)
|
||||
|
||||
# C. 再次截图 OCR 获取转换后的文字
|
||||
after_convert_shot = os.path.join(OUTPUT_DIR, f"after_auto_{vy}.jpg")
|
||||
d.screenshot(after_convert_shot)
|
||||
convert_ocr = ocr_kit.read_text(after_convert_shot)
|
||||
|
||||
# 提取转换文字:寻找在语音图标下方的文字块
|
||||
converted_text = ""
|
||||
for c_bbox, c_text, c_conf in convert_ocr:
|
||||
cc_x = (c_bbox[0][0] + c_bbox[2][0]) / 2
|
||||
cc_y = (c_bbox[0][1] + c_bbox[2][1]) / 2
|
||||
# 转换后的文字通常在语音图标下方 30-300 像素内,且水平位置相近
|
||||
if 30 < cc_y - vy < 300 and abs(cc_x - vx) < 250:
|
||||
converted_text = c_text
|
||||
break
|
||||
|
||||
if converted_text:
|
||||
logger.info(f"✨ OCR 识别成功!")
|
||||
print(f"\n[语音转文字结果]: {converted_text}\n")
|
||||
# 同步到消息对象
|
||||
v_msg['content'] = converted_text
|
||||
v_msg['is_converted'] = True
|
||||
# 如果这条消息也是最后一条消息,更新 dialogue_log 需要的内容
|
||||
if v_msg == last_msg:
|
||||
last_msg['content'] = converted_text
|
||||
else:
|
||||
logger.warning("OCR 未找到 '转文字' 菜单项")
|
||||
# 点击空白处关闭菜单,避免遮挡
|
||||
d.click(vx + 200, vy)
|
||||
logger.warning("❌ OCR 未能提取到转换后的文字内容")
|
||||
|
||||
if should_trigger_convert:
|
||||
# 转换完成后稍微多等一下,确保 UI 刷新
|
||||
# D. 长按并点击“取消转文字”恢复界面
|
||||
logger.info("正在恢复界面状态 (点击'取消转文字')...")
|
||||
d.long_click(vx, vy, 1.5)
|
||||
time.sleep(1.0)
|
||||
# 即使触发了转换,我们也返回当前的对话日志,但在日志末尾注明正在转换
|
||||
dialogue_log.append("系统: [正在转换语音...]")
|
||||
return dialogue_log, input_field_coordinates
|
||||
cancel_shot = os.path.join(OUTPUT_DIR, f"cancel_menu_{vy}.jpg")
|
||||
d.screenshot(cancel_shot)
|
||||
cancel_template = os.path.join(TEMPLATE_DIR, "cancel_zhuan_wen_zi.jpg")
|
||||
cancel_btn = find_template_match(cancel_shot, cancel_template, threshold=0.7)
|
||||
|
||||
if cancel_btn:
|
||||
c_btn_x, c_btn_y = int(cancel_btn[0]), int(cancel_btn[1])
|
||||
d.click(c_btn_x, c_btn_y)
|
||||
logger.info(f"✅ 已点击'取消转文字' ({c_btn_x}, {c_btn_y}),界面已恢复")
|
||||
else:
|
||||
# 兜底:点击语音图标右侧空白处尝试关闭菜单
|
||||
logger.warning("⚠️ 未找到'取消转文字'按钮,尝试点击空白处关闭菜单")
|
||||
d.click(vx + 300, vy)
|
||||
else:
|
||||
logger.warning("❌ 未能找到'转文字'按钮,可能长按失败或模板不匹配")
|
||||
# 尝试点击空白处退出菜单
|
||||
d.click(vx + 300, vy)
|
||||
|
||||
# 8. 整合对话日志 (仅针对最后一条消息进行反馈)
|
||||
dialogue_log = []
|
||||
if last_msg['type'] == 'voice':
|
||||
# 优先使用刚才转文字得到的内容
|
||||
content = last_msg.get('content') or "[语音]"
|
||||
dialogue_log.append(f"{last_msg['sender']}: {content}")
|
||||
else:
|
||||
dialogue_log.append(f"{last_msg['sender']}: {last_msg['content']}")
|
||||
|
||||
return dialogue_log, input_field_coordinates
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"VLM 分析失败: {e}", exc_info=True)
|
||||
logger.error(f"analyze_chat_image 失败: {e}", exc_info=True)
|
||||
return [], None
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Reference in New Issue
Block a user