""" 测试模板占位符识别和替换功能 读取模板文件,识别占位符,用虚拟数据替换并生成新文档 """ import os import re from pathlib import Path from docx import Document from services.document_service import DocumentService def extract_placeholders_from_docx(file_path: str) -> set: """ 从docx文件中提取所有占位符 Args: file_path: docx文件路径 Returns: 占位符集合,格式: {'field_code1', 'field_code2', ...} """ placeholders = set() pattern = r'\{\{([^}]+)\}\}' # 匹配 {{field_code}} 格式 try: doc = Document(file_path) # 从段落中提取占位符 for paragraph in doc.paragraphs: # 获取段落的所有文本(包括 run 中的文本) text = ''.join([run.text for run in paragraph.runs]) if not text: text = paragraph.text matches = re.findall(pattern, text) for match in matches: cleaned = match.strip() # 过滤掉不完整的占位符(包含 { 或 } 的) if cleaned and '{' not in cleaned and '}' not in cleaned: placeholders.add(cleaned) # 从表格中提取占位符 for table in doc.tables: for row in table.rows: for cell in row.cells: # 获取单元格的所有文本(包括 run 中的文本) cell_text = ''.join([run.text for para in cell.paragraphs for run in para.runs]) if not cell_text: cell_text = cell.text matches = re.findall(pattern, cell_text) for match in matches: cleaned = match.strip() # 过滤掉不完整的占位符(包含 { 或 } 的) if cleaned and '{' not in cleaned and '}' not in cleaned: placeholders.add(cleaned) except Exception as e: print(f" [错误] 读取文件失败 - {str(e)}") import traceback traceback.print_exc() return placeholders def generate_mock_data(placeholders: set) -> dict: """ 根据占位符生成虚拟数据 Args: placeholders: 占位符集合 Returns: 虚拟数据字典 """ # 常见字段的虚拟数据映射 mock_data_map = { # 基本信息 'target_name': '张三', 'target_organization_and_position': '某市某区某局副局长', 'target_gender': '男', 'target_date_of_birth': '198005', 'target_date_of_birth_full': '1980年05月15日', 'target_political_status': '中共党员', 'target_professional_rank': '副处级', 'target_id_number': '110101198005151234', 'target_address': '某市某区某街道某小区1号楼101室', 'target_registered_address': '某市某区某街道某小区1号楼101室', 'target_contact': '13800138000', 'target_place_of_origin': '某省某市', 'target_ethnicity': '汉族', # 问题线索 'clue_source': '群众举报', 'target_issue_description': '涉嫌违反工作纪律,在项目审批过程中存在不当行为', 'target_problem_description': '在项目审批过程中,未严格按照规定程序执行,存在程序不规范问题', 'target_work_basic_info': '2005年参加工作,先后在某局多个科室工作,2018年任现职', 'target_attitude': '被核查人表示认识到问题的严重性,愿意积极配合调查', # 审批信息 'filler_name': '李四', 'department_opinion': '同意开展初步核实', 'approval_time': '2024年12月15日', 'report_card_request_time': '2024年12月15日', 'handling_department': '某市纪委监委第一监督检查室', 'handler_name': '王五', # 谈话相关 'appointment_time': '2024年12月20日上午9:00', 'appointment_location': '某市纪委监委谈话室', 'notification_time': '2024年12月18日', 'notification_location': '某市纪委监委', # 核查信息 'investigation_unit_name': '某市纪委监委', 'investigation_team_leader_name': '赵六', 'investigation_team_member_names': '赵六、孙七、周八', 'investigation_location': '某市纪委监委', 'investigation_team_code': '2024-001', # 其他 'commission_name': '某市纪律检查委员会', 'backup_personnel': '钱九', 'assessment_opinion': '风险评估为低风险,可以开展谈话', } # 为所有占位符生成数据 field_data = {} for placeholder in placeholders: if placeholder in mock_data_map: field_data[placeholder] = mock_data_map[placeholder] else: # 如果占位符不在映射中,生成默认值 field_data[placeholder] = f'[虚拟数据-{placeholder}]' return field_data def test_template_replacement(template_path: str, output_dir: str): """ 测试模板占位符识别和替换 Args: template_path: 模板文件路径 output_dir: 输出目录 """ print(f"\n{'='*80}") print(f"测试模板: {template_path}") print(f"{'='*80}") # 检查模板文件是否存在 if not os.path.exists(template_path): print(f" [错误] 模板文件不存在: {template_path}") return False # 提取占位符 print(f"\n[步骤1] 提取占位符...") placeholders = extract_placeholders_from_docx(template_path) print(f" 发现 {len(placeholders)} 个不同的占位符:") for placeholder in sorted(placeholders): print(f" - {{{{ {placeholder} }}}}") if not placeholders: print(f" [警告] 未发现任何占位符,模板可能不需要填充数据") return False # 生成虚拟数据 print(f"\n[步骤2] 生成虚拟数据...") field_data = generate_mock_data(placeholders) print(f" 生成了 {len(field_data)} 个字段的虚拟数据") # 创建输出目录 os.makedirs(output_dir, exist_ok=True) # 使用DocumentService填充模板 print(f"\n[步骤3] 填充模板...") try: # 注意:DocumentService需要环境变量配置,但我们可以直接使用fill_template方法 # 为了测试,我们创建一个简化的填充方法 doc = Document(template_path) placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}') # 替换段落中的占位符 for paragraph in doc.paragraphs: full_text = paragraph.text if not full_text: continue # 检查是否有占位符 has_placeholder = False for field_code in field_data.keys(): placeholder = f"{{{{{field_code}}}}}" if placeholder in full_text: has_placeholder = True break if has_placeholder: # 执行替换 final_text = full_text for field_code, field_value in field_data.items(): placeholder = f"{{{{{field_code}}}}}" replacement_value = str(field_value) if field_value else '' final_text = final_text.replace(placeholder, replacement_value) # 替换段落文本(保持格式) if len(paragraph.runs) == 1: paragraph.runs[0].text = final_text else: # 多个run的情况:合并为一个run for run in paragraph.runs[1:]: run.text = '' if paragraph.runs: paragraph.runs[0].text = final_text # 替换表格中的占位符 for table in doc.tables: for row in table.rows: for cell in row.cells: for paragraph in cell.paragraphs: full_text = paragraph.text if not full_text: continue # 检查是否有占位符 has_placeholder = False for field_code in field_data.keys(): placeholder = f"{{{{{field_code}}}}}" if placeholder in full_text: has_placeholder = True break if has_placeholder: # 执行替换 final_text = full_text for field_code, field_value in field_data.items(): placeholder = f"{{{{{field_code}}}}}" replacement_value = str(field_value) if field_value else '' final_text = final_text.replace(placeholder, replacement_value) # 替换段落文本 if len(paragraph.runs) == 1: paragraph.runs[0].text = final_text else: for run in paragraph.runs[1:]: run.text = '' if paragraph.runs: paragraph.runs[0].text = final_text # 生成输出文件名 template_name = Path(template_path).stem output_file = os.path.join(output_dir, f"{template_name}_已填充.docx") # 保存文档 doc.save(output_file) print(f" [成功] 文档已保存到: {output_file}") # 验证替换结果 print(f"\n[步骤4] 验证替换结果...") verify_doc = Document(output_file) remaining_placeholders = set() # 检查段落 for paragraph in verify_doc.paragraphs: text = paragraph.text matches = placeholder_pattern.findall(text) for match in matches: cleaned = match.strip() if cleaned and '{' not in cleaned and '}' not in cleaned: remaining_placeholders.add(cleaned) # 检查表格 for table in verify_doc.tables: for row in table.rows: for cell in row.cells: for paragraph in cell.paragraphs: text = paragraph.text matches = placeholder_pattern.findall(text) for match in matches: cleaned = match.strip() if cleaned and '{' not in cleaned and '}' not in cleaned: remaining_placeholders.add(cleaned) if remaining_placeholders: print(f" [警告] 仍有 {len(remaining_placeholders)} 个占位符未替换:") for placeholder in sorted(remaining_placeholders): print(f" - {{{{ {placeholder} }}}}") else: print(f" [成功] 所有占位符已成功替换!") return True except Exception as e: print(f" [错误] 填充模板失败: {str(e)}") import traceback traceback.print_exc() return False def main(): """主函数""" # 项目根目录 project_root = Path(__file__).parent # 模板文件路径 template1_path = project_root / "template_finish" / "2-初核模版" / "2.谈话审批" / "走读式谈话审批" / "2谈话审批表.docx" template2_path = project_root / "template_finish" / "2-初核模版" / "3.初核结论" / "8-1请示报告卡(初核报告结论) .docx" # 输出目录 output_dir = project_root / "output_temp" print("="*80) print("模板占位符识别和替换测试") print("="*80) # 测试第一个模板 success1 = test_template_replacement(str(template1_path), str(output_dir)) # 测试第二个模板 success2 = test_template_replacement(str(template2_path), str(output_dir)) # 总结 print(f"\n{'='*80}") print("测试总结") print(f"{'='*80}") print(f"模板1 (2谈话审批表.docx): {'[成功]' if success1 else '[失败]'}") print(f"模板2 (8-1请示报告卡(初核报告结论).docx): {'[成功]' if success2 else '[失败]'}") print(f"\n输出目录: {output_dir}") print(f"{'='*80}\n") if __name__ == "__main__": main()