""" 处理原始模板文档,自动添加占位符 根据占位符与字段对照表,智能识别文档类型并添加相应的占位符 使用AI大模型智能分析文档内容,识别可替换位置 """ import os import re from pathlib import Path from typing import Dict, List, Optional import json try: from docx import Document from docx.shared import Pt except ImportError: print("错误: 请先安装 python-docx: pip install python-docx") exit(1) # 尝试导入AI辅助工具 try: from template_ai_helper import TemplateAIHelper, get_available_fields_for_document HAS_AI_HELPER = True except ImportError: HAS_AI_HELPER = False print("警告: 无法导入AI辅助工具,将使用基础模式(不使用AI分析)") # 尝试导入win32com用于.doc文件转换(Windows系统) HAS_WIN32COM = False HAS_PYTHONCOM = False try: import win32com.client HAS_WIN32COM = True try: import pythoncom HAS_PYTHONCOM = True except ImportError: pass except ImportError: pass if not HAS_WIN32COM: print("="*60) print("警告: 未安装 pywin32,无法自动转换 .doc 文件") print("="*60) print("解决方案:") print(" 1. 安装 pywin32: pip install pywin32") print(" 2. 或者手动将所有 .doc 文件转换为 .docx 格式") print(" 3. 转换后重新运行此脚本") print("="*60) # 项目根目录 PROJECT_ROOT = Path(__file__).parent ORIGINAL_TEMPLATES_DIR = PROJECT_ROOT / "模板" / "原始模板" OUTPUT_TEMPLATES_DIR = PROJECT_ROOT / "模板" FIELD_MAPPING_FILE = PROJECT_ROOT / "占位符与字段对照表.md" # 文档类型映射(根据文件名识别) DOCUMENT_TYPE_MAPPING = { "请示报告卡": { "template_code": "REPORT_CARD", "fields": ["target_name", "target_organization_and_position", "report_card_request_time"], "input_fields": ["clue_info"] }, "初步核实审批表": { "template_code": "PRELIMINARY_VERIFICATION_APPROVAL", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth", "target_political_status", "target_professional_rank", "clue_source", "target_issue_description", "department_opinion", "filler_name" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "初核方案": { "template_code": "INVESTIGATION_PLAN", "fields": [ "target_name", "target_organization_and_position", "target_work_basic_info", "target_issue_description", "investigation_unit_name", "investigation_team_leader_name", "investigation_team_member_names", "investigation_location" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "谈话通知书": { "template_code": "NOTIFICATION_LETTER", "fields": [ "target_name", "target_organization_and_position", "target_id_number", "appointment_time", "appointment_location", "approval_time", "handling_department", "handler_name", "notification_time", "notification_location" ], "input_fields": ["target_basic_info_clue"] }, "谈话笔录": { "template_code": "INTERVIEW_RECORD", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "谈话询问对象情况摸底调查30问": { "template_code": "INVESTIGATION_30_QUESTIONS", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "被谈话人权利义务告知书": { "template_code": "RIGHTS_OBLIGATIONS_NOTICE", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "点对点交接单": { "template_code": "HANDOVER_FORM", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "陪送交接单": { "template_code": "ESCORT_HANDOVER_FORM", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "保密承诺书": { "template_code": "CONFIDENTIALITY_COMMITMENT", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "办案人员-办案安全保密承诺书": { "template_code": "INVESTIGATOR_CONFIDENTIALITY_COMMITMENT", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": [] }, "请示报告卡(初核报告结论)": { "template_code": "REPORT_CARD_CONCLUSION", "fields": [ "investigation_team_code", "target_name", "target_problem_description", "target_attitude" ], "input_fields": [] }, "初核情况报告": { "template_code": "INVESTIGATION_REPORT", "fields": [ "target_name", "commission_name", "target_work_basic_info", "target_issue_description", "target_problem_description", "target_organization_and_position" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "谈话审批表": { "template_code": "INTERVIEW_APPROVAL_FORM", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "谈话前安全风险评估表": { "template_code": "PRE_INTERVIEW_RISK_ASSESSMENT", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "谈话方案": { "template_code": "INTERVIEW_PLAN", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": ["clue_info", "target_basic_info_clue"] }, "谈话后安全风险评估表": { "template_code": "POST_INTERVIEW_RISK_ASSESSMENT", "fields": [ "target_name", "target_organization_and_position", "target_gender", "target_date_of_birth_full", "target_political_status", "target_address", "target_registered_address", "target_contact", "target_place_of_origin", "target_ethnicity", "target_id_number", "investigation_team_code" ], "input_fields": ["clue_info", "target_basic_info_clue"] } } # 字段名称到字段编码的映射(用于智能识别) FIELD_NAME_TO_CODE = { "被核查人姓名": "target_name", "被核查人员单位及职务": "target_organization_and_position", "被核查人员性别": "target_gender", "被核查人员出生年月": "target_date_of_birth", "被核查人员出生年月日": "target_date_of_birth_full", "被核查人员政治面貌": "target_political_status", "被核查人员职级": "target_professional_rank", "被核查人员身份证号": "target_id_number", "被核查人员身份证件及号码": "target_id_number", "被核查人员住址": "target_address", "被核查人员户籍住址": "target_registered_address", "被核查人员联系方式": "target_contact", "被核查人员籍贯": "target_place_of_origin", "被核查人员民族": "target_ethnicity", "线索来源": "clue_source", "主要问题线索": "target_issue_description", "被核查人问题描述": "target_problem_description", "被核查人员工作基本情况": "target_work_basic_info", "核查单位名称": "investigation_unit_name", "核查组组长姓名": "investigation_team_leader_name", "核查组成员姓名": "investigation_team_member_names", "核查地点": "investigation_location", "核查组代号": "investigation_team_code", "应到时间": "appointment_time", "应到地点": "appointment_location", "批准时间": "approval_time", "承办部门": "handling_department", "承办人": "handler_name", "谈话通知时间": "notification_time", "谈话通知地点": "notification_location", "请示报告卡请示时间": "report_card_request_time", "初步核实审批表承办部门意见": "department_opinion", "初步核实审批表填表人": "filler_name", "被核查人员本人认识和态度": "target_attitude", "纪委名称": "commission_name" } def convert_doc_to_docx(doc_path: Path) -> Optional[Path]: """ 将.doc文件转换为.docx格式(Windows系统使用win32com) Args: doc_path: .doc文件路径 Returns: 转换后的.docx文件路径,如果失败返回None """ if not HAS_WIN32COM: print(f" 警告: 未安装 pywin32,无法转换 {doc_path.name}") print(f" 解决方案: pip install pywin32") print(f" 或者: 请手动将 {doc_path.name} 转换为 .docx 格式") return None word = None doc = None try: # 初始化COM(如果可用) if HAS_PYTHONCOM: pythoncom.CoInitialize() word = win32com.client.Dispatch("Word.Application") word.Visible = False word.DisplayAlerts = 0 # 不显示警告 docx_path = doc_path.with_suffix('.docx') # 检查源文件是否存在 if not doc_path.exists(): print(f" ✗ 错误: 源文件不存在: {doc_path}") if word: word.Quit() return None # 打开.doc文件(使用绝对路径) abs_doc_path = str(doc_path.absolute()) abs_docx_path = str(docx_path.absolute()) print(f" 正在转换...") print(f" 源: {doc_path.name}") print(f" 目标: {docx_path.name}") # 打开文档 doc = word.Documents.Open( abs_doc_path, ReadOnly=True, ConfirmConversions=False, AddToRecentFiles=False ) # 另存为.docx格式 (16 = wdFormatXMLDocument) doc.SaveAs2( abs_docx_path, FileFormat=16 # wdFormatXMLDocument ) # 关闭文档 doc.Close(False) # False表示不保存更改 doc = None # 退出Word word.Quit() word = None # 检查转换后的文件是否存在 if docx_path.exists() and docx_path.stat().st_size > 0: file_size = docx_path.stat().st_size print(f" ✓ 转换成功 ({file_size} 字节)") return docx_path else: print(f" ✗ 转换失败: 目标文件不存在或为空") return None except Exception as e: error_msg = str(e) error_type = type(e).__name__ print(f" ✗ 转换失败: {error_type}: {error_msg}") # 清理资源 try: if doc: doc.Close(False) except: pass try: if word: word.Quit() except: pass # 提供更详细的错误信息和解决方案 print(f" 诊断信息:") if "Word.Application" in error_msg or "COM" in error_msg or "CreateObject" in error_msg: print(f" - 可能原因: Microsoft Word 未安装或无法访问") print(f" - 解决方案:") print(f" 1. 确保已安装 Microsoft Word(不是 WPS)") print(f" 2. 手动将 .doc 文件转换为 .docx 格式") print(f" 3. 使用 Word 打开文件,另存为 .docx 格式") elif "pywin32" in error_msg.lower() or "win32com" in error_msg.lower(): print(f" - 解决方案: pip install pywin32") elif "权限" in error_msg or "Permission" in error_msg: print(f" - 可能原因: 文件被其他程序占用或权限不足") print(f" - 解决方案: 关闭文件,检查文件权限") else: print(f" - 请检查错误信息并手动转换文件") return None finally: # 清理COM if HAS_PYTHONCOM: try: pythoncom.CoUninitialize() except: pass def identify_document_type(file_name: str) -> Optional[Dict]: """ 根据文件名识别文档类型 Args: file_name: 文件名 Returns: 文档类型配置,如果无法识别返回None """ # 移除扩展名和常见后缀 base_name = Path(file_name).stem base_name = base_name.replace("(XXX)", "").replace("(XXX)", "").replace("XXX", "") base_name = base_name.strip() # 尝试匹配文档类型 for doc_type, config in DOCUMENT_TYPE_MAPPING.items(): if doc_type in base_name: return config # 如果无法精确匹配,尝试部分匹配 for doc_type, config in DOCUMENT_TYPE_MAPPING.items(): if any(keyword in base_name for keyword in doc_type.split()): return config return None def find_placeholder_positions(text: str, field_name: str, field_code: str) -> List[tuple]: """ 在文本中查找可能需要替换为占位符的位置 Args: text: 文本内容 field_name: 字段名称 field_code: 字段编码 Returns: 找到的位置列表 (start, end, replacement_text) """ positions = [] # 查找字段名称后的内容 pattern = rf"{re.escape(field_name)}[::]\s*([^\n\r]+)" matches = re.finditer(pattern, text) for match in matches: value = match.group(1).strip() # 如果值不是占位符格式,且不是空值,则可能需要替换 if value and not value.startswith("{{"): # 跳过常见的示例值 if value not in ["XXX", "xxx", "-", "——", "——", "待填", "待填写"]: positions.append(( match.start(1), match.end(1), f"{{{{{field_code}}}}}" )) return positions def replace_text_in_runs(runs, old_text: str, new_text: str) -> bool: """ 在runs中替换文本 Args: runs: 文本runs列表 old_text: 要替换的旧文本 new_text: 新文本 Returns: 是否进行了替换 """ full_text = ''.join(run.text for run in runs) if old_text not in full_text: return False # 找到包含旧文本的runs current_pos = 0 for run in runs: run_start = current_pos run_end = current_pos + len(run.text) if run_start <= full_text.find(old_text) < run_end: # 在这个run中替换 run.text = run.text.replace(old_text, new_text) return True current_pos = run_end return False def apply_ai_replacements(text: str, ai_replacements: List[Dict]) -> str: """ 应用AI识别的替换建议 Args: text: 原始文本 ai_replacements: AI识别的替换建议列表 Returns: 替换后的文本 """ result_text = text # 按置信度排序,优先处理高置信度的替换 sorted_replacements = sorted(ai_replacements, key=lambda x: x.get('confidence', 0), reverse=True) for replacement in sorted_replacements: original = replacement.get('original_text', '') replacement_text = replacement.get('replacement', '') confidence = replacement.get('confidence', 0) # 只应用置信度大于0.7的替换 if confidence > 0.7 and original and replacement_text: # 转义特殊字符 escaped_original = re.escape(original) # 替换(只替换第一次出现,避免重复替换) if escaped_original in result_text: result_text = result_text.replace(original, replacement_text, 1) return result_text def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool: """ 处理单个文档,添加占位符 Args: input_path: 输入文件路径 output_path: 输出文件路径 doc_config: 文档配置 use_ai: 是否使用AI分析(默认True) Returns: 是否处理成功 """ try: # 如果是.doc文件,先转换为.docx if input_path.suffix.lower() == '.doc': print(f" 转换 .doc 到 .docx: {input_path.name}") docx_path = convert_doc_to_docx(input_path) if not docx_path or not docx_path.exists(): print(f" ⚠ 跳过: 无法转换 {input_path.name}") return False input_path = docx_path # 初始化AI助手(如果可用) ai_helper = None available_fields = [] if use_ai and HAS_AI_HELPER: try: ai_helper = TemplateAIHelper() available_fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE) print(f" ✓ AI分析已启用") except Exception as e: print(f" ⚠ AI分析不可用: {e},将使用基础模式") ai_helper = None # 打开文档 doc = Document(str(input_path)) # 统计替换次数 replacement_count = 0 ai_replacement_count = 0 # 处理段落中的占位符 for para_idx, paragraph in enumerate(doc.paragraphs): if not paragraph.text: continue text = paragraph.text original_text = text # 首先使用AI分析(如果可用) if ai_helper and available_fields: try: doc_type = doc_config.get('template_code', '未知') ai_replacements = ai_helper.analyze_paragraph( text, available_fields, doc_type ) if ai_replacements: # 应用AI识别的替换 text = apply_ai_replacements(text, ai_replacements) if text != original_text: ai_replacement_count += len(ai_replacements) except Exception as e: print(f" ⚠ 段落 {para_idx+1} AI分析失败: {e}") # 然后使用规则匹配(作为补充) for field_code in doc_config.get('fields', []): # 查找字段名称 for field_name, code in FIELD_NAME_TO_CODE.items(): if code == field_code: # 模式1: 字段名称: XXX 或 字段名称: 具体值 pattern1 = rf"({re.escape(field_name)}[::]\s*)([^\n\r{{]+?)(\s|$|\n|\r|,|。)" def replace_func1(match): value = match.group(2).strip() # 如果值不是占位符格式,且不是空值,则替换 if value and not value.startswith("{{") and value not in ["——", "—", "-", ""]: return f"{match.group(1)}{{{{{field_code}}}}}{match.group(3)}" return match.group(0) text = re.sub(pattern1, replace_func1, text) # 模式2: 直接替换常见的占位符(XXX) pattern2 = rf"({re.escape(field_name)}[::]\s*)(XXX|xxx|待填|待填写)" text = re.sub(pattern2, rf"\1{{{{{field_code}}}}}", text) break if text != original_text: # 替换整个段落文本 paragraph.clear() paragraph.add_run(text) replacement_count += 1 # 处理表格中的占位符 for table_idx, table in enumerate(doc.tables): for row_idx, row in enumerate(table.rows): for col_idx, cell in enumerate(row.cells): for paragraph in cell.paragraphs: if not paragraph.text: continue text = paragraph.text original_text = text # 首先使用AI分析(如果可用) if ai_helper and available_fields: try: doc_type = doc_config.get('template_code', '未知') ai_replacements = ai_helper.analyze_table_cell( text, available_fields, doc_type, row_idx, col_idx ) if ai_replacements: # 应用AI识别的替换 text = apply_ai_replacements(text, ai_replacements) if text != original_text: ai_replacement_count += len(ai_replacements) except Exception as e: pass # 静默失败,继续使用规则匹配 # 然后使用规则匹配(作为补充) for field_code in doc_config.get('fields', []): for field_name, code in FIELD_NAME_TO_CODE.items(): if code == field_code: # 模式1: 字段名称: XXX 或 字段名称: 具体值 pattern1 = rf"({re.escape(field_name)}[::]\s*)([^\n\r{{]+?)(\s|$|\n|\r|,|。)" def replace_func1(match): value = match.group(2).strip() if value and not value.startswith("{{") and value not in ["——", "—", "-", ""]: return f"{match.group(1)}{{{{{field_code}}}}}{match.group(3)}" return match.group(0) text = re.sub(pattern1, replace_func1, text) # 模式2: 直接替换常见的占位符(XXX) pattern2 = rf"({re.escape(field_name)}[::]\s*)(XXX|xxx|待填|待填写)" text = re.sub(pattern2, rf"\1{{{{{field_code}}}}}", text) break if text != original_text: paragraph.clear() paragraph.add_run(text) replacement_count += 1 # 确保输出目录存在 output_path.parent.mkdir(parents=True, exist_ok=True) # 保存文档 doc.save(str(output_path)) # 输出统计信息 if replacement_count > 0 or ai_replacement_count > 0: msg = f" ✓ 处理成功" if ai_replacement_count > 0: msg += f",AI识别 {ai_replacement_count} 处" if replacement_count > 0: msg += f",规则匹配 {replacement_count} 处" print(msg) else: print(f" ⚠ 处理完成,但未找到需要替换的内容(可能已包含占位符)") return True except Exception as e: print(f" ✗ 处理失败: {e}") import traceback traceback.print_exc() return False def process_all_templates(): """ 处理所有原始模板文件 """ print("="*80) print("开始处理原始模板文档") print("="*80) if not ORIGINAL_TEMPLATES_DIR.exists(): print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}") return # 统计信息 processed_count = 0 skipped_count = 0 failed_count = 0 # 遍历所有文件 for root, dirs, files in os.walk(ORIGINAL_TEMPLATES_DIR): for file in files: # 只处理.doc和.docx文件 if not file.endswith(('.doc', '.docx')): continue input_path = Path(root) / file # 识别文档类型 doc_config = identify_document_type(file) if not doc_config: print(f"\n⚠ 无法识别文档类型: {file}") print(f" 路径: {input_path}") skipped_count += 1 continue # 生成输出路径(保持相对目录结构) relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR) output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{Path(file).stem}.docx" print(f"\n处理: {file}") print(f" 类型: {doc_config.get('template_code', 'UNKNOWN')}") print(f" 输出: {output_path}") # 处理文档(使用AI分析) if process_document(input_path, output_path, doc_config, use_ai=True): processed_count += 1 else: failed_count += 1 # 输出统计信息 print("\n" + "="*80) print("处理完成") print("="*80) print(f"成功处理: {processed_count} 个文件") print(f"跳过: {skipped_count} 个文件") print(f"失败: {failed_count} 个文件") print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}") print("\n请检查生成的模板文件,确认占位符是否正确添加。") print("如有需要,请手动调整占位符位置。") if __name__ == '__main__': process_all_templates()