增加占位符替换测试脚本

2025-12-15 14:45:42 +08:00 · 2025-12-15 14:45:42 +08:00 · 557c9ae351
commit 557c9ae351
parent cb4a07f148
6 changed files with 433 additions and 299470 deletions
--- a/backups/backup_finyx_20251209_170604.zip
+++ b/backups/backup_finyx_20251209_170604.zip
--- a/backups/backup_finyx_20251212_074837.sql
+++ b/backups/backup_finyx_20251212_074837.sql
--- a/backups/backup_target_db_20251212_092809.sql
+++ b/backups/backup_target_db_20251212_092809.sql
--- a/backups/backup_target_db_20251212_093051.sql
+++ b/backups/backup_target_db_20251212_093051.sql
--- a/test_template_placeholder_replacement.py
+++ b/test_template_placeholder_replacement.py
@@ -0,0 +1,333 @@
 """
 测试模板占位符识别和替换功能
 读取模板文件，识别占位符，用虚拟数据替换并生成新文档
 """
 import os
 import re
 from pathlib import Path
 from docx import Document
 from services.document_service import DocumentService
 def extract_placeholders_from_docx(file_path: str) -> set:
    """
    Collect every distinct ``{{field_code}}`` placeholder found in a docx file.

    The paragraphs in ``doc.paragraphs`` and the cells of the tables in
    ``doc.tables`` are scanned. If opening or reading the document raises,
    the error and its traceback are printed and whatever was collected up
    to that point is returned.

    Args:
        file_path: Path to the docx file.

    Returns:
        Set of placeholder names with surrounding whitespace stripped,
        e.g. {'field_code1', 'field_code2', ...}.
    """
    found = set()
    placeholder_re = r'\{\{([^}]+)\}\}'  # matches the {{field_code}} form

    def harvest(text):
        # Record each well-formed placeholder name in ``text``; empty names
        # and fragments that still contain a brace are skipped.
        for raw in re.findall(placeholder_re, text):
            name = raw.strip()
            if not name or '{' in name or '}' in name:
                continue
            found.add(name)

    try:
        document = Document(file_path)
        # Paragraphs: join the run texts, falling back to paragraph.text
        # when the runs yield nothing.
        for para in document.paragraphs:
            harvest(''.join(run.text for run in para.runs) or para.text)
        # Table cells: the runs of every paragraph in a cell are joined
        # into a single string before matching.
        for tbl in document.tables:
            for row in tbl.rows:
                for cell in row.cells:
                    joined = ''.join(
                        run.text
                        for cell_para in cell.paragraphs
                        for run in cell_para.runs
                    )
                    harvest(joined or cell.text)
    except Exception as e:
        print(f"  [错误] 读取文件失败 - {str(e)}")
        import traceback
        traceback.print_exc()
    return found
 def generate_mock_data(placeholders: set) -> dict:
    """
    Build a mock value for each placeholder name.

    Names present in the built-in table get a realistic sample value; any
    other name gets a generated marker string that embeds the name itself.

    Args:
        placeholders: Set of placeholder names.

    Returns:
        Dict mapping every given placeholder name to its mock value.
    """
    # Sample values for the commonly used fields.
    known_values = {
        # Basic information
        'target_name': '张三',
        'target_organization_and_position': '某市某区某局副局长',
        'target_gender': '男',
        'target_date_of_birth': '198005',
        'target_date_of_birth_full': '1980年05月15日',
        'target_political_status': '中共党员',
        'target_professional_rank': '副处级',
        'target_id_number': '110101198005151234',
        'target_address': '某市某区某街道某小区1号楼101室',
        'target_registered_address': '某市某区某街道某小区1号楼101室',
        'target_contact': '13800138000',
        'target_place_of_origin': '某省某市',
        'target_ethnicity': '汉族',
        # Issue clues
        'clue_source': '群众举报',
        'target_issue_description': '涉嫌违反工作纪律，在项目审批过程中存在不当行为',
        'target_problem_description': '在项目审批过程中，未严格按照规定程序执行，存在程序不规范问题',
        'target_work_basic_info': '2005年参加工作，先后在某局多个科室工作，2018年任现职',
        'target_attitude': '被核查人表示认识到问题的严重性，愿意积极配合调查',
        # Approval information
        'filler_name': '李四',
        'department_opinion': '同意开展初步核实',
        'approval_time': '2024年12月15日',
        'report_card_request_time': '2024年12月15日',
        'handling_department': '某市纪委监委第一监督检查室',
        'handler_name': '王五',
        # Interview related
        'appointment_time': '2024年12月20日上午9:00',
        'appointment_location': '某市纪委监委谈话室',
        'notification_time': '2024年12月18日',
        'notification_location': '某市纪委监委',
        # Investigation information
        'investigation_unit_name': '某市纪委监委',
        'investigation_team_leader_name': '赵六',
        'investigation_team_member_names': '赵六、孙七、周八',
        'investigation_location': '某市纪委监委',
        'investigation_team_code': '2024-001',
        # Miscellaneous
        'commission_name': '某市纪律检查委员会',
        'backup_personnel': '钱九',
        'assessment_opinion': '风险评估为低风险，可以开展谈话',
    }
    # One entry per placeholder: the table value when known, otherwise a
    # default marker built from the placeholder name.
    return {
        name: known_values.get(name, f'[虚拟数据-{name}]')
        for name in placeholders
    }
 def _iter_document_paragraphs(doc):
    """Yield the paragraphs in ``doc.paragraphs``, then every paragraph of every cell in ``doc.tables``."""
    yield from doc.paragraphs
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from cell.paragraphs


 def _fill_paragraph(paragraph, field_data: dict, pattern) -> None:
    """Replace the known ``{{field_code}}`` placeholders of one paragraph in place."""
    full_text = paragraph.text
    if not full_text:
        return

    def substitute(match):
        # Look the name up with surrounding whitespace stripped, the same
        # way the extraction step normalizes it, so "{{ name }}" and
        # "{{name}}" are both filled.
        field_code = match.group(1).strip()
        if field_code not in field_data:
            return match.group(0)  # unknown placeholder: leave untouched
        value = field_data[field_code]
        # Only None becomes empty; other falsy values (0, False) keep
        # their string form.
        return '' if value is None else str(value)

    final_text = pattern.sub(substitute, full_text)
    runs = paragraph.runs
    if final_text == full_text or not runs:
        return
    # The whole replaced text goes into the first run and the remaining
    # runs are emptied, so the paragraph ends up carrying a single run of
    # text.
    for run in runs[1:]:
        run.text = ''
    runs[0].text = final_text


 def _find_placeholders(doc, pattern) -> set:
    """Return the stripped names of every ``{{...}}`` placeholder present in ``doc``."""
    names = set()
    for paragraph in _iter_document_paragraphs(doc):
        for match in pattern.findall(paragraph.text):
            cleaned = match.strip()
            # Skip empty names and fragments with a stray opening brace
            # (the pattern itself already excludes '}').
            if cleaned and '{' not in cleaned:
                names.add(cleaned)
    return names


 def test_template_replacement(template_path: str, output_dir: str) -> bool:
    """
    Fill one docx template with mock data and report the outcome.

    Steps: extract the placeholders, generate mock values for them, replace
    the placeholders in paragraphs and table cells, save the result as
    ``<template stem>_已填充.docx`` inside ``output_dir``, then reopen the
    saved file and list any placeholders that are still present.

    Placeholders are matched with optional whitespace inside the braces,
    consistent with how the extraction step strips names.

    NOTE(review): the name starts with ``test_`` but the function takes
    plain arguments; it is meant to be called from ``main()``, not to be
    collected by a test runner.

    Args:
        template_path: Path of the template file.
        output_dir: Directory for the filled document (created if missing).

    Returns:
        True when the filled document was saved (leftover placeholders only
        produce a warning); False when the template is missing, contains no
        placeholders, or filling raised an exception.
    """
    print(f"\n{'='*80}")
    print(f"测试模板: {template_path}")
    print(f"{'='*80}")
    # Bail out early when the template file is missing.
    if not os.path.exists(template_path):
        print(f"  [错误] 模板文件不存在: {template_path}")
        return False
    # Step 1: extract the placeholders.
    print(f"\n[步骤1] 提取占位符...")
    placeholders = extract_placeholders_from_docx(template_path)
    print(f"  发现 {len(placeholders)} 个不同的占位符:")
    for placeholder in sorted(placeholders):
        print(f"    - {{{{ {placeholder} }}}}")
    if not placeholders:
        print(f"  [警告] 未发现任何占位符，模板可能不需要填充数据")
        return False
    # Step 2: generate mock data.
    print(f"\n[步骤2] 生成虚拟数据...")
    field_data = generate_mock_data(placeholders)
    print(f"  生成了 {len(field_data)} 个字段的虚拟数据")
    os.makedirs(output_dir, exist_ok=True)
    # Step 3: fill the template.
    print(f"\n[步骤3] 填充模板...")
    try:
        # DocumentService needs environment configuration, so this test
        # uses a simplified local fill instead of its fill_template method.
        doc = Document(template_path)
        placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}')
        for paragraph in _iter_document_paragraphs(doc):
            _fill_paragraph(paragraph, field_data, placeholder_pattern)
        # Save next to the other outputs, named after the template.
        template_name = Path(template_path).stem
        output_file = os.path.join(output_dir, f"{template_name}_已填充.docx")
        doc.save(output_file)
        print(f"  [成功] 文档已保存到: {output_file}")
        # Step 4: reopen the saved file and look for leftovers.
        print(f"\n[步骤4] 验证替换结果...")
        verify_doc = Document(output_file)
        remaining_placeholders = _find_placeholders(verify_doc, placeholder_pattern)
        if remaining_placeholders:
            print(f"  [警告] 仍有 {len(remaining_placeholders)} 个占位符未替换:")
            for placeholder in sorted(remaining_placeholders):
                print(f"    - {{{{ {placeholder} }}}}")
        else:
            print(f"  [成功] 所有占位符已成功替换！")
        return True
    except Exception as e:
        # Script-level boundary: report the failure and let the caller
        # continue with the next template.
        print(f"  [错误] 填充模板失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return False
 def main():
    """Run the replacement check on the two bundled templates and print a summary."""
    # All paths are resolved relative to the directory holding this script.
    base_dir = Path(__file__).parent
    initial_review_dir = base_dir / "template_finish" / "2-初核模版"
    approval_form = initial_review_dir / "2.谈话审批" / "走读式谈话审批" / "2谈话审批表.docx"
    report_card = initial_review_dir / "3.初核结论" / "8-1请示报告卡（初核报告结论） .docx"
    # Filled documents are written here.
    output_dir = base_dir / "output_temp"

    banner = "=" * 80

    def verdict(ok):
        return '[成功]' if ok else '[失败]'

    print(banner)
    print("模板占位符识别和替换测试")
    print(banner)

    # Process both templates in order before printing the summary.
    first_ok, second_ok = [
        test_template_replacement(str(path), str(output_dir))
        for path in (approval_form, report_card)
    ]

    print(f"\n{banner}")
    print("测试总结")
    print(banner)
    print(f"模板1 (2谈话审批表.docx): {verdict(first_ok)}")
    print(f"模板2 (8-1请示报告卡（初核报告结论）.docx): {verdict(second_ok)}")
    print(f"\n输出目录: {output_dir}")
    print(f"{banner}\n")
 # Run the template checks only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/verify_generated_files.py
+++ b/verify_generated_files.py
@@ -0,0 +1,100 @@
 """
 验证生成的Word文档中是否还有未替换的占位符
 """
 import re
 from pathlib import Path
 from docx import Document
 def check_placeholders_in_docx(file_path: str):
    """
    Find the ``{{field_code}}`` placeholders that are still present in a docx file.

    The paragraphs in ``doc.paragraphs`` and the cells of the tables in
    ``doc.tables`` are inspected.

    Args:
        file_path: Path to the docx file.

    Returns:
        Set of placeholder names (whitespace stripped) found in the file, or
        None when opening or reading the file raised (the error is printed).
    """
    placeholder_re = r'\{\{([^}]+)\}\}'  # matches the {{field_code}} form
    try:
        doc = Document(file_path)
        # First gather every piece of text that has to be scanned.
        chunks = []
        for paragraph in doc.paragraphs:
            # Joined run texts, falling back to paragraph.text when empty.
            chunks.append(''.join(run.text for run in paragraph.runs) or paragraph.text)
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # All runs of all paragraphs in the cell form one string.
                    joined = ''.join(
                        run.text
                        for para in cell.paragraphs
                        for run in para.runs
                    )
                    chunks.append(joined or cell.text)
        # Then pull the placeholder names out of the gathered text.
        leftovers = set()
        for chunk in chunks:
            for raw in re.findall(placeholder_re, chunk):
                name = raw.strip()
                # Ignore empty names and fragments containing a brace.
                if name and not ('{' in name or '}' in name):
                    leftovers.add(name)
        return leftovers
    except Exception as e:
        print(f"  错误: 读取文件失败 - {str(e)}")
        return None
 def main():
    """Check the two generated documents for leftover placeholders and print a verdict."""
    out_dir = Path(__file__).parent / "output_temp"
    rule = "=" * 80
    print(rule)
    print("验证生成的Word文档")
    print(rule)
    # The two generated files to inspect.
    expected_names = (
        "2谈话审批表_已填充.docx",
        "8-1请示报告卡（初核报告结论） _已填充.docx",
    )
    failures = 0
    for expected in expected_names:
        candidate = out_dir / expected
        print(f"\n检查文件: {candidate.name}")
        if not candidate.exists():
            print(f"  [错误] 文件不存在")
            failures += 1
            continue
        remaining = check_placeholders_in_docx(str(candidate))
        if remaining is None:
            # The checker already printed the underlying error.
            print(f"  [错误] 无法读取文件")
            failures += 1
            continue
        if not remaining:
            print(f"  [成功] 所有占位符已成功替换，文档可以正常使用")
            continue
        print(f"  [警告] 发现 {len(remaining)} 个未替换的占位符:")
        for item in sorted(remaining):
            print(f"    - {{{{ {item} }}}}")
        failures += 1
    print(f"\n{rule}")
    if failures == 0:
        print("验证结果: 所有文件验证通过！")
    else:
        print("验证结果: 部分文件存在问题，请检查")
    print(f"{rule}\n")
 # Run the verification only when this file is executed as a script.
 if __name__ == "__main__":
    main()