# ai-business-write/test_template_placeholder_replacement.py

"""
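Test template placeholder detection and replacement.

Reads a template file, detects its placeholders, replaces them with mock data
and generates a new document.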
"""
import os
import re
from pathlib import Path
from docx import Document
from services.document_service import DocumentService


def extract_placeholders_from_docx(file_path: str) -> set:
    """
    Collect every ``{{field_code}}`` placeholder name found in a docx file.

    Body paragraphs and the cells of top-level tables are scanned. Each raw
    name is stripped, and inner whitespace runs are collapsed to a single
    underscore before it is stored.

    Args:
        file_path: Path of the docx file to scan.

    Returns:
        Set of normalized placeholder names, e.g. {'field_code1', 'field_code2'}.
        If reading fails, the error is printed and whatever was collected so
        far (possibly an empty set) is returned.
    """
    token_re = re.compile(r'\{\{([^}]+)\}\}')  # matches the {{field_code}} form
    found = set()

    def harvest(text):
        # Normalize every token in ``text`` and keep the well-formed ones.
        for raw in token_re.findall(text):
            name = re.sub(r'\s+', '_', raw.strip())
            # A leftover brace means the token was malformed; skip it.
            if not name or '{' in name or '}' in name:
                continue
            found.add(name)

    try:
        document = Document(file_path)

        # Body paragraphs: prefer the concatenated run text, fall back to
        # paragraph.text when the runs yield nothing.
        for para in document.paragraphs:
            joined = ''.join(run.text for run in para.runs)
            harvest(joined or para.text)

        # Table cells: same strategy, with all runs of a cell joined together.
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    joined = ''.join(
                        run.text
                        for cell_para in cell.paragraphs
                        for run in cell_para.runs
                    )
                    harvest(joined or cell.text)

    except Exception as e:
        print(f"  [错误] 读取文件失败 - {str(e)}")
        import traceback
        traceback.print_exc()

    return found


def generate_mock_data(placeholders: set) -> dict:
    """
    Build mock field values for the given placeholder names.

    Args:
        placeholders: Collection of placeholder names to fill.

    Returns:
        Dict mapping each placeholder name to a mock value. Names without a
        predefined value get a generated ``[虚拟数据-<name>]`` marker string.
    """
    # Predefined mock values for the commonly used fields.
    known_values = {
        # Basic personal information
        "target_name": "张三",
        "target_organization_and_position": "某市某区某局副局长",
        "target_gender": "男",
        "target_date_of_birth": "198005",
        "target_date_of_birth_full": "1980年05月15日",
        "target_political_status": "中共党员",
        "target_professional_rank": "副处级",
        "target_id_number": "110101198005151234",
        "target_address": "某市某区某街道某小区1号楼101室",
        "target_registered_address": "某市某区某街道某小区1号楼101室",
        "target_contact": "13800138000",
        "target_place_of_origin": "某省某市",
        "target_ethnicity": "汉族",

        # Issue clues
        "clue_source": "群众举报",
        "target_issue_description": "涉嫌违反工作纪律，在项目审批过程中存在不当行为",
        "target_problem_description": "在项目审批过程中，未严格按照规定程序执行，存在程序不规范问题",
        "target_work_basic_info": "2005年参加工作，先后在某局多个科室工作，2018年任现职",
        "target_attitude": "被核查人表示认识到问题的严重性，愿意积极配合调查",

        # Approval information
        "filler_name": "李四",
        "department_opinion": "同意开展初步核实",
        "approval_time": "2024年12月15日",
        "report_card_request_time": "2024年12月15日",
        "handling_department": "某市纪委监委第一监督检查室",
        "handler_name": "王五",

        # Interview related
        "appointment_time": "2024年12月20日上午9:00",
        "appointment_location": "某市纪委监委谈话室",
        "notification_time": "2024年12月18日",
        "notification_location": "某市纪委监委",

        # Investigation information
        "investigation_unit_name": "某市纪委监委",
        "investigation_team_leader_name": "赵六",
        "investigation_team_member_names": "赵六、孙七、周八",
        "investigation_location": "某市纪委监委",
        "investigation_team_code": "2024-001",

        # Miscellaneous
        "commission_name": "某市纪律检查委员会",
        "backup_personnel": "钱九",
        "assessment_opinion": "风险评估为低风险，可以开展谈话",
    }

    # Every requested name gets a value: the predefined one when available,
    # otherwise a generated marker that embeds the name itself.
    return {
        name: known_values.get(name, f"[虚拟数据-{name}]")
        for name in placeholders
    }


# Matches a complete ``{{ ... }}`` token whose body contains no braces.
_FILL_PLACEHOLDER_RE = re.compile(r'\{\{([^{}]+)\}\}')


def _normalize_placeholder_name(raw: str) -> str:
    """Strip a raw placeholder body and collapse inner whitespace runs to ``_``."""
    return re.sub(r'\s+', '_', raw.strip())


def _iter_document_paragraphs(doc):
    """Yield the body paragraphs of ``doc``, then every paragraph of its top-level table cells."""
    yield from doc.paragraphs
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from cell.paragraphs


def _fill_paragraph(paragraph, field_data: dict) -> None:
    """Replace every known placeholder in ``paragraph`` with its value from ``field_data``."""
    full_text = paragraph.text
    if not full_text:
        return

    def substitute(match):
        # Normalize the token the same way extraction does, so that
        # "{{ field_name }}", "{{field name}}" and "{{field_name}}" all
        # resolve to the same key.
        key = _normalize_placeholder_name(match.group(1))
        if key not in field_data:
            return match.group(0)  # unknown token: leave it untouched
        value = field_data[key]
        return str(value) if value else ''

    final_text = _FILL_PLACEHOLDER_RE.sub(substitute, full_text)
    if final_text == full_text:
        return  # nothing replaced; keep the runs (and their formatting) as they are

    runs = paragraph.runs
    if not runs:
        return
    # A placeholder may be split across several runs, so the whole text is
    # written into the first run and the remaining runs are blanked.
    runs[0].text = final_text
    for run in runs[1:]:
        run.text = ''


def test_template_replacement(template_path: str, output_dir: str) -> bool:
    """
    Detect the placeholders of a template, fill them with mock data and verify the result.

    The filled document is saved as ``<template stem>_已填充.docx`` inside
    ``output_dir`` (created if missing). Progress and results are printed.

    Args:
        template_path: Path of the docx template file.
        output_dir: Directory that receives the filled document.

    Returns:
        True when the filled document was saved (even if verification still
        finds unreplaced placeholders, which are only reported as a warning).
        False when the template does not exist, contains no placeholders, or
        an exception occurs while filling.
    """
    print(f"\n{'='*80}")
    print(f"测试模板: {template_path}")
    print(f"{'='*80}")

    # Make sure the template file exists.
    if not os.path.exists(template_path):
        print(f"  [错误] 模板文件不存在: {template_path}")
        return False

    # Step 1: extract placeholders.
    print("\n[步骤1] 提取占位符...")
    placeholders = extract_placeholders_from_docx(template_path)
    print(f"  发现 {len(placeholders)} 个不同的占位符:")
    for placeholder in sorted(placeholders):
        print(f"    - {{{{ {placeholder} }}}}")

    if not placeholders:
        print("  [警告] 未发现任何占位符，模板可能不需要填充数据")
        return False

    # Step 2: generate mock data.
    print("\n[步骤2] 生成虚拟数据...")
    field_data = generate_mock_data(placeholders)
    print(f"  生成了 {len(field_data)} 个字段的虚拟数据")

    os.makedirs(output_dir, exist_ok=True)

    # Step 3: fill the template with a simplified, self-contained routine.
    print("\n[步骤3] 填充模板...")
    try:
        doc = Document(template_path)
        for paragraph in _iter_document_paragraphs(doc):
            _fill_paragraph(paragraph, field_data)

        template_name = Path(template_path).stem
        output_file = os.path.join(output_dir, f"{template_name}_已填充.docx")

        doc.save(output_file)
        print(f"  [成功] 文档已保存到: {output_file}")

        # Step 4: re-open the saved file and look for leftover placeholders.
        print("\n[步骤4] 验证替换结果...")
        placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}')
        verify_doc = Document(output_file)
        remaining_placeholders = set()
        for paragraph in _iter_document_paragraphs(verify_doc):
            for match in placeholder_pattern.findall(paragraph.text):
                cleaned = match.strip()
                # Tokens that still contain a brace are malformed; ignore them.
                if cleaned and '{' not in cleaned and '}' not in cleaned:
                    remaining_placeholders.add(cleaned)

        if remaining_placeholders:
            print(f"  [警告] 仍有 {len(remaining_placeholders)} 个占位符未替换:")
            for placeholder in sorted(remaining_placeholders):
                print(f"    - {{{{ {placeholder} }}}}")
        else:
            print("  [成功] 所有占位符已成功替换！")

        return True

    except Exception as e:
        # Report any failure while filling or verifying instead of crashing the script.
        print(f"  [错误] 填充模板失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the replacement check on a hard-coded template under this script's directory and print a summary."""
    banner = "=" * 80
    base_dir = Path(__file__).parent

    # Template to check and the directory that receives the filled copy.
    template_file = base_dir.joinpath(
        "template_finish",
        "2-初核模版",
        "2.谈话审批",
        "走读式谈话审批",
        "2谈话审批表-重新制作表格.docx",
    )
    output_dir = base_dir / "output_temp"

    print(banner)
    print("模板占位符识别和替换测试")
    print(banner)

    succeeded = test_template_replacement(str(template_file), str(output_dir))
    outcome = "[成功]" if succeeded else "[失败]"

    # Summary
    print(f"\n{banner}")
    print("测试总结")
    print(banner)
    print(f"模板 (2谈话审批表-重新制作表格.docx): {outcome}")
    print(f"\n输出目录: {output_dir}")
    print(f"{banner}\n")


# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()