# ai-business-write/test_template_reading_and_placeholder.py

"""
直接测试模板读取和占位符识别功能（不依赖API服务）
1. 测试所有模板文件是否能正确读取
2. 验证占位符识别功能
3. 测试占位符替换功能（使用DocumentService）
"""
import os
import sys
from pathlib import Path
from typing import Dict, List, Set
from dotenv import load_dotenv
import re
from docx import Document

# 添加项目根目录到路径
PROJECT_ROOT = Path(__file__).parent
sys.path.insert(0, str(PROJECT_ROOT))

from services.document_service import DocumentService

# 加载环境变量
load_dotenv()

TEMPLATES_DIR = PROJECT_ROOT / "template_finish"


def print_section(title):
    """Print a section banner.

    The banner is a blank line, a rule of 70 ``=`` characters, the title
    indented by two spaces, and a closing rule of the same width.
    """
    rule = "=" * 70
    print("\n" + rule, f"  {title}", rule, sep="\n")


def print_result(success, message):
    """Print *message* prefixed with ``[OK]`` when *success* is truthy, else ``[FAIL]``."""
    if success:
        tag = "[OK]"
    else:
        tag = "[FAIL]"
    print(f"{tag} {message}")


def scan_local_templates(base_dir: Path) -> Dict[str, Path]:
    """Recursively collect the ``.docx`` templates found under *base_dir*.

    Args:
        base_dir: Directory to scan. A missing directory yields an empty
            mapping.

    Returns:
        A mapping from each template's path relative to ``PROJECT_ROOT``
        (always ``/``-separated) to its path on disk. Entries are inserted
        in sorted path order so callers that pick "the first N templates"
        behave the same on every run and every filesystem.

    Raises:
        ValueError: If a discovered file is not located under
            ``PROJECT_ROOT`` (raised by ``Path.relative_to``).
    """
    templates: Dict[str, Path] = {}
    if not base_dir.exists():
        return templates

    # rglob() yields entries in filesystem order; sort for reproducibility.
    for file_path in sorted(base_dir.rglob('*')):
        if not file_path.is_file() or file_path.suffix.lower() != '.docx':
            continue
        # "~$name.docx" files are lock files Word creates next to an open
        # document; they are not valid .docx archives and would only show up
        # as spurious read failures.
        if file_path.name.startswith('~$'):
            continue
        relative_path = file_path.relative_to(PROJECT_ROOT)
        # as_posix() normalises Windows separators without touching a
        # backslash that is legitimately part of a POSIX file name.
        templates[relative_path.as_posix()] = file_path

    return templates


def extract_placeholders_from_docx(file_path: Path) -> tuple[Set[str], bool, str]:
    """Collect every ``{{placeholder}}`` name used in a .docx file.

    Body paragraphs and the paragraphs inside the cells of the tables exposed
    by ``doc.tables`` are scanned. Names are stripped of surrounding
    whitespace and empty names are ignored.

    Args:
        file_path: Path of the document to open.

    Returns:
        A ``(placeholders, success, error_message)`` tuple. ``success`` is
        ``False`` and ``error_message`` holds ``str(exception)`` when the
        document could not be opened or scanned; in that case
        ``placeholders`` contains whatever was gathered before the failure.
    """
    found: Set[str] = set()
    pattern = re.compile(r'\{\{([^}]+)\}\}')

    def harvest(paragraphs):
        # Add the stripped, non-empty placeholder names of each paragraph.
        for para in paragraphs:
            for raw in pattern.findall(para.text):
                name = raw.strip()
                if name:
                    found.add(name)

    try:
        doc = Document(file_path)

        # Body paragraphs first, then table cells.
        harvest(doc.paragraphs)

        for table in doc.tables:
            try:
                for row in table.rows:
                    for cell in row.cells:
                        harvest(cell.paragraphs)
            except Exception:
                # Some table structures can raise while being walked; skip
                # the rest of that table and carry on with the next one.
                continue
    except Exception as exc:
        return found, False, str(exc)

    return found, True, ""


def _discard_temp_file(path) -> None:
    """Best-effort removal of a file handed back by DocumentService."""
    if not path:
        return
    try:
        Path(path).unlink()
    except OSError:
        # Cleanup must never fail the run (file already gone, still locked, ...).
        pass


def _read_all_templates(local_templates: Dict[str, Path]) -> List[dict]:
    """Extract placeholders from every template and return one result dict per file."""
    total = len(local_templates)
    template_results = []

    for i, (rel_path, file_path) in enumerate(local_templates.items(), 1):
        placeholders, success, error = extract_placeholders_from_docx(file_path)
        template_results.append({
            'path': rel_path,
            'name': file_path.name,
            'read_success': success,
            'placeholders': placeholders,
            'error': error,
        })

        # Report progress every 10 templates.
        if i % 10 == 0:
            print(f"  已处理: {i}/{total}")

    return template_results


def _print_summary(template_results: List[dict]) -> None:
    """Print the read statistics (step 3) and the sorted placeholder list (step 4)."""
    readable = [r for r in template_results if r['read_success']]
    failed = [r for r in template_results if not r['read_success']]
    with_placeholders_count = sum(1 for r in readable if r['placeholders'])
    # Only successfully read templates contribute to the global placeholder set.
    all_placeholders = set().union(*(r['placeholders'] for r in readable))

    print_section("3. 测试结果统计")
    print(f"  总模板数: {len(template_results)}")
    print(f"  读取成功: {len(readable)}")
    print(f"  读取失败: {len(failed)}")
    print(f"  有占位符: {with_placeholders_count}")
    print(f"  无占位符: {len(readable) - with_placeholders_count}")
    print(f"  发现的占位符总数: {len(all_placeholders)} 个不同的占位符")

    if failed:
        print("\n  读取失败的模板:")
        for result in failed:
            print(f"    - {result['name']}: {result['error']}")

    print_section("4. 所有占位符列表")
    if all_placeholders:
        for placeholder in sorted(all_placeholders):
            print(f"    - {placeholder}")
    else:
        print("    未发现占位符")


def _check_service_template_reading(template_results: List[dict]) -> None:
    """Step 5: fetch up to three placeholder-bearing templates through DocumentService."""
    print_section("5. 测试DocumentService模板读取功能")

    try:
        document_service = DocumentService()
        print_result(True, "DocumentService初始化成功")

        # Exercise only a few templates that are known to contain placeholders.
        test_templates = [
            r for r in template_results
            if r['read_success'] and r['placeholders']
        ][:3]

        if not test_templates:
            print_result(False, "没有找到有占位符的模板进行测试")
            return

        print(f"\n  测试 {len(test_templates)} 个模板的读取功能:")

        for template_result in test_templates:
            rel_path = template_result['path']
            placeholders = template_result['placeholders']

            print(f"\n    模板: {template_result['name']}")
            print(f"      路径: {rel_path}")
            print(f"      占位符: {sorted(placeholders)}")

            temp_path = None
            try:
                # NOTE(review): per the original comment this method now reads
                # from local disk - confirm it returns a throwaway copy, since
                # the returned file is deleted below.
                temp_path = document_service.download_template_from_minio(rel_path)

                if not (temp_path and Path(temp_path).exists()):
                    print_result(False, "模板读取失败：文件不存在")
                    continue

                print_result(True, f"模板读取成功: {temp_path}")

                # Make sure the returned file really opens as a document.
                try:
                    doc = Document(temp_path)
                    print(f"      文档段落数: {len(doc.paragraphs)}")
                    print(f"      文档表格数: {len(doc.tables)}")
                except Exception as e:
                    print_result(False, f"验证文档内容失败: {str(e)}")
            except Exception as e:
                print_result(False, f"模板读取失败: {str(e)}")
            finally:
                # Clean up on every path, not only after a successful check.
                _discard_temp_file(temp_path)

    except Exception as e:
        print_result(False, f"DocumentService初始化失败: {str(e)}")
        import traceback
        traceback.print_exc()


def _check_placeholder_replacement(template_results: List[dict]) -> None:
    """Step 6: fill the first placeholder-bearing template and verify nothing is left unreplaced."""
    print_section("6. 测试占位符替换功能")

    try:
        document_service = DocumentService()

        # Pick the first template that has at least one placeholder.
        test_template = next(
            (r for r in template_results if r['read_success'] and r['placeholders']),
            None,
        )

        if test_template is None:
            print_result(False, "没有找到有占位符的模板进行测试")
            return

        rel_path = test_template['path']
        placeholders = test_template['placeholders']

        print(f"  测试模板: {test_template['name']}")
        print(f"    路径: {rel_path}")
        print(f"    占位符: {sorted(placeholders)}")

        # Known fields get realistic sample values; any other placeholder
        # gets a generated one so every placeholder has data.
        test_values = {
            'target_name': '测试姓名',
            'target_organization': '测试单位',
            'target_position': '测试职务',
            'target_organization_and_position': '测试单位-测试职务',
            'investigation_team_code': 'DC2025001',
            'appointment_time': '2025-12-16 14:00',
            'appointment_location': '会议室A',
        }
        field_data = {
            placeholder: test_values.get(placeholder, f'测试值_{placeholder}')
            for placeholder in placeholders
        }

        print(f"    测试数据: {field_data}")

        template_path = None
        filled_doc_path = None
        try:
            template_path = document_service.download_template_from_minio(rel_path)

            if not (template_path and Path(template_path).exists()):
                print_result(False, "模板读取失败：文件不存在")
                return

            filled_doc_path = document_service.fill_template(template_path, field_data)

            if not (filled_doc_path and Path(filled_doc_path).exists()):
                print_result(False, "文档生成失败：文件不存在")
                return

            print_result(True, f"文档生成成功: {filled_doc_path}")

            # Re-scan with the same extractor used for the templates so that
            # placeholders left inside tables are detected too, not only
            # those in body paragraphs.
            remaining_placeholders, verified, verify_error = (
                extract_placeholders_from_docx(filled_doc_path)
            )

            if not verified:
                print_result(False, f"验证生成的文档失败: {verify_error}")
            elif remaining_placeholders:
                print_result(False, f"仍有未替换的占位符: {sorted(remaining_placeholders)}")
            else:
                print_result(True, "所有占位符已成功替换")
        except Exception as e:
            print_result(False, f"占位符替换测试失败: {str(e)}")
            import traceback
            traceback.print_exc()
        finally:
            # Remove each file independently so one failure does not leak
            # the other.
            _discard_temp_file(template_path)
            _discard_temp_file(filled_doc_path)

    except Exception as e:
        print_result(False, f"占位符替换测试初始化失败: {str(e)}")
        import traceback
        traceback.print_exc()


def test_template_reading_and_placeholders():
    """Run the template reading and placeholder checks and print a report.

    Steps:
        1. Scan ``TEMPLATES_DIR`` for ``.docx`` templates.
        2. Read every template and extract its ``{{placeholder}}`` names.
        3. Print read/placeholder statistics and any read failures.
        4. Print the sorted list of all distinct placeholders.
        5. Fetch up to three templates through ``DocumentService`` and check
           that the returned files open as documents.
        6. Fill one template with sample data through ``DocumentService`` and
           verify that no placeholder is left in the result.

    Results are only printed; the function returns ``None`` and does not
    raise on failed checks.
    """
    print_section("测试模板读取和占位符识别")

    # 1. Scan templates
    print_section("1. 扫描本地模板文件")
    local_templates = scan_local_templates(TEMPLATES_DIR)
    print_result(True, f"找到 {len(local_templates)} 个.docx模板文件")

    if not local_templates:
        print_result(False, "未找到模板文件")
        return

    # 2. Read every template
    print_section("2. 测试模板读取和占位符识别")
    template_results = _read_all_templates(local_templates)

    # 3 + 4. Statistics and placeholder listing
    _print_summary(template_results)

    # 5. DocumentService template reading
    _check_service_template_reading(template_results)

    # 6. Placeholder replacement
    _check_placeholder_replacement(template_results)

    print_section("测试完成")


if __name__ == "__main__":
    test_template_reading_and_placeholders()