# ai-business-write/process_templates_docx_only.py

"""
处理已转换的 .docx 模板文档，自动添加占位符
此脚本专门处理已经手动转换为 .docx 格式的文档，跳过 .doc 转换步骤
根据占位符与字段对照表，智能识别文档类型并添加相应的占位符
使用AI大模型智能分析文档内容，识别可替换位置
"""
import os
import re
from pathlib import Path
from typing import Dict, List, Optional

try:
    from docx import Document
except ImportError:
    print("错误: 请先安装 python-docx: pip install python-docx")
    # Raise SystemExit directly instead of calling exit(): the `exit` builtin is
    # injected by the `site` module and is not guaranteed to exist (for example
    # under `python -S` or in frozen builds).
    raise SystemExit(1)

# Try to import the optional AI helper; without it the script falls back to
# rule-based matching only.
try:
    from template_ai_helper import TemplateAIHelper, get_available_fields_for_document
    HAS_AI_HELPER = True
except ImportError:
    HAS_AI_HELPER = False
    print("警告: 无法导入AI辅助工具，将使用基础模式（不使用AI分析）")

# Project root: the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent
# Input templates are read from here (walked recursively by process_all_templates).
ORIGINAL_TEMPLATES_DIR = PROJECT_ROOT / "模板" / "原始模板"
# Processed templates are written here, mirroring the input sub-directory layout.
OUTPUT_TEMPLATES_DIR = PROJECT_ROOT / "模板"
# Path of the placeholder/field reference table; not read anywhere in this file.
FIELD_MAPPING_FILE = PROJECT_ROOT / "占位符与字段对照表.md"

# Field codes shared by every template that carries the full personal profile
# of the person concerned (interview records, notices, hand-over forms, ...).
_PERSON_PROFILE_FIELDS = (
    "target_name", "target_organization_and_position", "target_gender",
    "target_date_of_birth_full", "target_political_status", "target_address",
    "target_registered_address", "target_contact", "target_place_of_origin",
    "target_ethnicity", "target_id_number", "investigation_team_code",
)

# Field codes shared by the two investigation-plan templates.
_INVESTIGATION_PLAN_FIELDS = (
    "target_name", "target_organization_and_position", "target_work_basic_info",
    "target_issue_description", "investigation_unit_name", "investigation_team_leader_name",
    "investigation_team_member_names", "investigation_location",
)

# The input-field pair used by most templates that declare input fields.
_CLUE_INPUT_FIELDS = ("clue_info", "target_basic_info_clue")


def _template_config(template_code, fields, input_fields=()):
    """Build one registry entry; every entry gets its own independent list objects."""
    return {
        "template_code": template_code,
        "fields": list(fields),
        "input_fields": list(input_fields),
    }


# Document-type registry, keyed by the document name that is searched for in
# template file names (see identify_document_type).
#   template_code - identifier of the template
#   fields        - field codes whose {{placeholders}} may be inserted
#   input_fields  - not read in this file; presumably consumed by the AI helper
#                   that receives the whole entry - TODO confirm
# NOTE: insertion order is significant to any code that scans this dict and
# stops at the first match, so the original order is preserved.
DOCUMENT_TYPE_MAPPING = {
    "请示报告卡": _template_config(
        "REPORT_CARD",
        ["target_name", "target_organization_and_position", "report_card_request_time"],
        ["clue_info"],
    ),
    "初步核实审批表": _template_config(
        "PRELIMINARY_VERIFICATION_APPROVAL",
        [
            "target_name", "target_organization_and_position", "target_gender",
            "target_date_of_birth", "target_political_status", "target_professional_rank",
            "clue_source", "target_issue_description", "department_opinion", "filler_name",
        ],
        _CLUE_INPUT_FIELDS,
    ),
    "初核方案": _template_config(
        "INVESTIGATION_PLAN", _INVESTIGATION_PLAN_FIELDS, _CLUE_INPUT_FIELDS
    ),
    "附件初核方案": _template_config(
        "INVESTIGATION_PLAN", _INVESTIGATION_PLAN_FIELDS, _CLUE_INPUT_FIELDS
    ),
    "谈话通知书": _template_config(
        "NOTIFICATION_LETTER",
        [
            "target_name", "target_organization_and_position", "target_id_number",
            "appointment_time", "appointment_location", "approval_time",
            "handling_department", "handler_name", "notification_time", "notification_location",
        ],
        ["target_basic_info_clue"],
    ),
    "谈话笔录": _template_config("INTERVIEW_RECORD", _PERSON_PROFILE_FIELDS),
    "谈话询问对象情况摸底调查30问": _template_config(
        "INVESTIGATION_30_QUESTIONS", _PERSON_PROFILE_FIELDS
    ),
    "被谈话人权利义务告知书": _template_config(
        "RIGHTS_OBLIGATIONS_NOTICE", _PERSON_PROFILE_FIELDS
    ),
    "点对点交接单": _template_config("HANDOVER_FORM", _PERSON_PROFILE_FIELDS),
    "陪送交接单": _template_config("ESCORT_HANDOVER_FORM", _PERSON_PROFILE_FIELDS),
    "保密承诺书": _template_config("CONFIDENTIALITY_COMMITMENT", _PERSON_PROFILE_FIELDS),
    "办案人员-办案安全保密承诺书": _template_config(
        "INVESTIGATOR_CONFIDENTIALITY_COMMITMENT", _PERSON_PROFILE_FIELDS
    ),
    "请示报告卡（初核报告结论）": _template_config(
        "REPORT_CARD_CONCLUSION",
        ["investigation_team_code", "target_name", "target_problem_description", "target_attitude"],
    ),
    "初核情况报告": _template_config(
        "INVESTIGATION_REPORT",
        [
            "target_name", "commission_name", "target_work_basic_info",
            "target_issue_description", "target_problem_description",
            "target_organization_and_position",
        ],
        _CLUE_INPUT_FIELDS,
    ),
    "谈话审批表": _template_config(
        "INTERVIEW_APPROVAL_FORM", _PERSON_PROFILE_FIELDS, _CLUE_INPUT_FIELDS
    ),
    "谈话前安全风险评估表": _template_config(
        "PRE_INTERVIEW_RISK_ASSESSMENT", _PERSON_PROFILE_FIELDS, _CLUE_INPUT_FIELDS
    ),
    "谈话方案": _template_config("INTERVIEW_PLAN", _PERSON_PROFILE_FIELDS, _CLUE_INPUT_FIELDS),
    "谈话后安全风险评估表": _template_config(
        "POST_INTERVIEW_RISK_ASSESSMENT", _PERSON_PROFILE_FIELDS, _CLUE_INPUT_FIELDS
    ),
}

# Mapping from the Chinese field label to its field code.
# process_document searches document text for "<label>:" / "<label>：" followed
# by a value and swaps that value for the "{{field_code}}" placeholder; the
# mapping is also handed to get_available_fields_for_document for the AI helper.
# Two labels map to target_id_number.
FIELD_NAME_TO_CODE = {
    # Personal details (target_* codes)
    "被核查人姓名": "target_name",
    "被核查人员单位及职务": "target_organization_and_position",
    "被核查人员性别": "target_gender",
    "被核查人员出生年月": "target_date_of_birth",
    "被核查人员出生年月日": "target_date_of_birth_full",
    "被核查人员政治面貌": "target_political_status",
    "被核查人员职级": "target_professional_rank",
    "被核查人员身份证号": "target_id_number",
    "被核查人员身份证件及号码": "target_id_number",
    "被核查人员住址": "target_address",
    "被核查人员户籍住址": "target_registered_address",
    "被核查人员联系方式": "target_contact",
    "被核查人员籍贯": "target_place_of_origin",
    "被核查人员民族": "target_ethnicity",
    # Clue / issue descriptions
    "线索来源": "clue_source",
    "主要问题线索": "target_issue_description",
    "被核查人问题描述": "target_problem_description",
    "被核查人员工作基本情况": "target_work_basic_info",
    # Investigation unit and team (investigation_* codes)
    "核查单位名称": "investigation_unit_name",
    "核查组组长姓名": "investigation_team_leader_name",
    "核查组成员姓名": "investigation_team_member_names",
    "核查地点": "investigation_location",
    "核查组代号": "investigation_team_code",
    # Appointment, approval and notification details
    "应到时间": "appointment_time",
    "应到地点": "appointment_location",
    "批准时间": "approval_time",
    "承办部门": "handling_department",
    "承办人": "handler_name",
    "谈话通知时间": "notification_time",
    "谈话通知地点": "notification_location",
    # Form-specific fields
    "请示报告卡请示时间": "report_card_request_time",
    "初步核实审批表承办部门意见": "department_opinion",
    "初步核实审批表填表人": "filler_name",
    "被核查人员本人认识和态度": "target_attitude",
    "纪委名称": "commission_name"
}


def identify_document_type(file_name: str) -> Optional[Dict]:
    """
    Identify the document type of a template from its file name.

    The file stem is stripped of known noise ("XXX" markers, the
    "转自DOC" conversion tag and the word "模板") and then compared with the
    keys of DOCUMENT_TYPE_MAPPING. Type names are tried longest first so that
    a specific name wins over a shorter name it contains (for example
    "办案人员-办案安全保密承诺书" over "保密承诺书"). Half-width parentheses are
    treated the same as full-width ones.

    Args:
        file_name: File name (with or without directory / extension).

    Returns:
        The matching document-type configuration, or None when the name
        cannot be recognised.
    """
    def normalize(name: str) -> str:
        # Treat "(...)" and "（...）" as the same spelling.
        return name.replace("(", "（").replace(")", "）")

    # Strip the extension and common suffixes / markers (order matters:
    # the parenthesised forms must go before the bare "XXX").
    base_name = Path(file_name).stem
    for noise in ("（XXX）", "(XXX)", "XXX", "_转自DOC", "转自DOC", "模板"):
        base_name = base_name.replace(noise, "")
    base_name = normalize(base_name.strip())

    # Longest type name first; sorted() is stable, so equal-length names keep
    # their order from DOCUMENT_TYPE_MAPPING.
    candidates = sorted(
        DOCUMENT_TYPE_MAPPING.items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    # Pass 1: the whole type name appears in the file name.
    for doc_type, config in candidates:
        if normalize(doc_type) in base_name:
            return config

    # Pass 2: any parenthesis-delimited part of the type name (longer than
    # one character) appears in the file name.
    for doc_type, config in candidates:
        keywords = normalize(doc_type).replace("（", " ").replace("）", " ").split()
        if any(keyword in base_name for keyword in keywords if len(keyword) > 1):
            return config

    return None


def apply_ai_replacements(text: str, ai_replacements: List[Dict]) -> str:
    """
    Apply the replacement suggestions produced by the AI analysis.

    Suggestions are processed from highest to lowest confidence. A suggestion
    is applied only when its confidence is strictly greater than 0.7 and both
    its 'original_text' and 'replacement' are non-empty strings; only the
    first occurrence of 'original_text' is replaced.

    Args:
        text: Original text.
        ai_replacements: Suggestion dicts with the keys 'original_text',
            'replacement' and 'confidence'. Malformed entries are ignored.

    Returns:
        The text with all accepted suggestions applied.
    """
    def confidence_of(suggestion: Dict) -> float:
        # The values come from a model response, so tolerate strings / None.
        try:
            return float(suggestion.get('confidence', 0))
        except (TypeError, ValueError):
            return 0.0

    suggestions = [item for item in (ai_replacements or []) if isinstance(item, dict)]
    suggestions.sort(key=confidence_of, reverse=True)

    result_text = text
    for suggestion in suggestions:
        if confidence_of(suggestion) <= 0.7:
            continue
        original = suggestion.get('original_text', '')
        replacement_text = suggestion.get('replacement', '')
        if not (isinstance(original, str) and isinstance(replacement_text, str)):
            continue
        if original and replacement_text:
            # Plain substring replacement: the text must NOT be regex-escaped,
            # otherwise originals containing '-', '.', '(' or spaces never match.
            # str.replace is a no-op when the original is absent.
            result_text = result_text.replace(original, replacement_text, 1)

    return result_text


# Values that mark a deliberately blank field; they are left untouched.
_BLANK_VALUE_MARKERS = ("——", "—", "-")


def _build_field_rules(doc_config: Dict) -> List[tuple]:
    """Compile, once per document, the label-based regex rules for its configured fields."""
    rules = []
    for field_code in doc_config.get('fields', []):
        placeholder = f"{{{{{field_code}}}}}"
        # Every label that maps to this field code gets its own rule, so alias
        # labels are honoured as well.
        for field_name, code in FIELD_NAME_TO_CODE.items():
            if code != field_code:
                continue
            label = re.escape(field_name)
            # Rule 1: "label: value", where the value runs up to whitespace,
            # end of text, "，" or "。" and contains no "{".
            value_pattern = re.compile(rf"({label}[：:]\s*)([^\n\r{{]+?)(\s|$|\n|\r|，|。)")
            # Rule 2: "label: XXX" style fill-in markers.
            marker_pattern = re.compile(rf"({label}[：:]\s*)(XXX|xxx|待填|待填写)")
            rules.append((placeholder, value_pattern, marker_pattern))
    return rules


def _apply_field_rules(text: str, rules: List[tuple]) -> str:
    """Replace the value after each known field label with its {{placeholder}}."""
    for placeholder, value_pattern, marker_pattern in rules:
        def fill_value(match, placeholder=placeholder):
            value = match.group(2).strip()
            # Keep existing placeholders and explicit blank markers as they are.
            if value and not value.startswith("{{") and value not in _BLANK_VALUE_MARKERS:
                return f"{match.group(1)}{placeholder}{match.group(3)}"
            return match.group(0)

        text = value_pattern.sub(fill_value, text)
        text = marker_pattern.sub(
            lambda match, placeholder=placeholder: f"{match.group(1)}{placeholder}", text
        )
    return text


def _apply_ai_suggestions(text: str, analyze, location: str):
    """Run one AI analysis call on text; return (new_text, suggestion_count), never raising."""
    try:
        suggestions = analyze(text)
        if not suggestions:
            return text, 0
        updated = apply_ai_replacements(text, suggestions)
    except Exception as e:
        # Best effort: an AI failure must not stop rule-based processing.
        print(f"    [AI] ⚠ {location} AI分析失败: {e}")
        return text, 0
    if updated == text:
        return text, 0
    # The count is the number of suggestions returned, not the number that
    # passed the confidence filter inside apply_ai_replacements.
    return updated, len(suggestions)


def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool:
    """
    Process a single document and insert placeholders.

    Each non-empty paragraph (in the body and in every table cell) is first
    passed to the AI helper, when one is available, and then to the
    label-based regex rules built from doc_config['fields']. A paragraph whose
    text changed is rewritten as a single run, so run-level formatting inside
    that paragraph is not preserved. The document is always saved to
    output_path, even when nothing was replaced.

    Args:
        input_path: Input file path (must be a .docx file).
        output_path: Output file path; parent directories are created.
        doc_config: Document-type configuration ('template_code', 'fields', ...).
        use_ai: Whether to use the AI analysis (default True). It is used only
            when the helper module was importable and its API test succeeds.

    Returns:
        True when the document was processed and saved, False when the input
        was skipped or any error occurred (errors are printed, not raised).
    """
    try:
        # Only .docx files are handled.
        if input_path.suffix.lower() != '.docx':
            print(f"  ⚠ 跳过: 不是 .docx 文件 ({input_path.suffix})")
            return False

        if not input_path.exists():
            print(f"  ✗ 错误: 文件不存在: {input_path}")
            return False

        print(f"  处理: {input_path.name}")

        # Initialise the AI helper (if available).
        ai_helper = None
        available_fields = []
        if use_ai and HAS_AI_HELPER:
            try:
                print(f"  [初始化] 正在初始化AI助手...")
                ai_helper = TemplateAIHelper()

                # Verify the API is reachable before relying on it.
                if not ai_helper.test_api_connection():
                    print(f"  [初始化] ⚠ API连接测试失败，将使用基础模式")
                    ai_helper = None
                else:
                    available_fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE)
                    print(f"  [初始化] ✓ AI分析已启用（可用字段: {len(available_fields)} 个）")
            except Exception as e:
                print(f"  [初始化] ⚠ AI分析不可用: {e}，将使用基础模式")
                import traceback
                traceback.print_exc()
                ai_helper = None

        print(f"  [读取] 正在打开文档...")
        doc = Document(str(input_path))
        paragraphs = doc.paragraphs
        tables = doc.tables

        # Statistics; the cell count is an estimate based on each table's first row.
        total_paragraphs = len([p for p in paragraphs if p.text.strip()])
        total_tables = len(tables)
        total_cells = sum(len(table.rows) * len(table.rows[0].cells) if table.rows else 0 for table in tables)

        print(f"  [统计] 文档包含: {total_paragraphs} 个段落, {total_tables} 个表格, 约 {total_cells} 个单元格")

        replacement_count = 0      # paragraphs whose text was rewritten
        ai_replacement_count = 0   # AI suggestions returned for changed paragraphs

        # Loop invariants, computed once per document.
        ai_enabled = bool(ai_helper and available_fields)
        doc_type = doc_config.get('template_code', '未知')
        rules = _build_field_rules(doc_config)
        paragraph_total = len(paragraphs)  # progress denominator matches para_idx

        # Body paragraphs.
        print(f"  [处理] 开始处理段落...")
        for para_idx, paragraph in enumerate(paragraphs):
            original_text = paragraph.text
            if not original_text:
                continue

            text = original_text

            # AI analysis first (if available).
            if ai_enabled:
                if para_idx % 10 == 0:  # progress line every 10 paragraphs
                    print(f"    [进度] 处理段落 {para_idx+1}/{paragraph_total}...")

                text, suggestion_count = _apply_ai_suggestions(
                    text,
                    lambda content: ai_helper.analyze_paragraph(content, available_fields, doc_type),
                    f"段落 {para_idx+1}",
                )
                if suggestion_count:
                    ai_replacement_count += suggestion_count
                    print(f"    [AI] 段落 {para_idx+1} 应用了 {suggestion_count} 个替换")

            # Rule matching afterwards, as a supplement.
            text = _apply_field_rules(text, rules)

            if text != original_text:
                # Replace the whole paragraph text.
                paragraph.clear()
                paragraph.add_run(text)
                replacement_count += 1

        # Table cells.
        print(f"  [处理] 开始处理表格...")
        for table_idx, table in enumerate(tables):
            if table_idx % 5 == 0:  # progress line every 5 tables
                print(f"    [进度] 处理表格 {table_idx+1}/{total_tables}...")
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    for paragraph in cell.paragraphs:
                        original_text = paragraph.text
                        if not original_text:
                            continue

                        text = original_text

                        if ai_enabled:
                            text, suggestion_count = _apply_ai_suggestions(
                                text,
                                lambda content: ai_helper.analyze_table_cell(
                                    content, available_fields, doc_type, row_idx, col_idx
                                ),
                                f"表格 {table_idx+1} 单元格({row_idx+1},{col_idx+1})",
                            )
                            ai_replacement_count += suggestion_count

                        text = _apply_field_rules(text, rules)

                        if text != original_text:
                            paragraph.clear()
                            paragraph.add_run(text)
                            replacement_count += 1

        # Make sure the output directory exists, then save.
        print(f"  [保存] 正在保存文档...")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        doc.save(str(output_path))
        print(f"  [保存] ✓ 文档已保存到: {output_path}")

        if replacement_count > 0 or ai_replacement_count > 0:
            msg = f"  ✓ 处理成功"
            if ai_replacement_count > 0:
                msg += f"，AI识别 {ai_replacement_count} 处"
            if replacement_count > 0:
                msg += f"，规则匹配 {replacement_count} 处"
            print(msg)
        else:
            print(f"  ⚠ 处理完成，但未找到需要替换的内容（可能已包含占位符）")

        return True

    except Exception as e:
        # Top-level boundary for one document: report and let the batch continue.
        print(f"  ✗ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False


def process_all_templates():
    """
    Process every converted .docx template under ORIGINAL_TEMPLATES_DIR.

    The directory tree is walked once. Each .docx file (extension compared
    case-insensitively, Word "~$" lock files ignored) is classified with
    identify_document_type and, when recognised, processed with
    process_document. The result is written below OUTPUT_TEMPLATES_DIR in the
    same relative sub-directory, under the file name with the conversion tag
    and "XXX" markers removed. A summary is printed at the end.
    """
    print("="*80)
    print("处理已转换的 .docx 模板文档（跳过 .doc 转换）")
    print("="*80)
    print()

    if not ORIGINAL_TEMPLATES_DIR.exists():
        print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}")
        return

    processed_count = 0
    skipped_count = 0
    failed_count = 0

    # Collect the work list in a single traversal so the announced total and
    # the files actually processed cannot diverge.
    all_files = []
    for root, _dirs, files in os.walk(ORIGINAL_TEMPLATES_DIR):
        for file in files:
            # "~$name.docx" is the lock file Word keeps next to an open
            # document; it is not a real document and cannot be opened.
            if file.startswith('~$'):
                continue
            # Only .docx files are processed; .doc files are skipped.
            if file.lower().endswith('.docx'):
                all_files.append(Path(root) / file)

    total_files = len(all_files)
    print(f"找到 {total_files} 个 .docx 文件需要处理\n")

    for file_index, input_path in enumerate(all_files, start=1):
        file = input_path.name

        # Identify the document type from the file name.
        doc_config = identify_document_type(file)

        if not doc_config:
            print(f"\n⚠ 无法识别文档类型: {file}")
            print(f"  路径: {input_path}")
            skipped_count += 1
            continue

        # Build the output path, keeping the relative directory structure.
        relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR)
        # Clean the file name (remove conversion tag and "XXX" markers).
        clean_name = Path(file).stem
        clean_name = clean_name.replace("_转自DOC", "").replace("转自DOC", "")
        clean_name = clean_name.replace("（XXX）", "").replace("(XXX)", "").replace("XXX", "")
        output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{clean_name}.docx"

        print(f"\n{'='*80}")
        print(f"[{file_index}/{total_files}] 处理: {file}")
        print(f"{'='*80}")
        print(f"  类型: {doc_config.get('template_code', 'UNKNOWN')}")
        print(f"  输入: {input_path}")
        print(f"  输出: {output_path}")

        # Process the document (AI analysis requested).
        if process_document(input_path, output_path, doc_config, use_ai=True):
            processed_count += 1
        else:
            failed_count += 1

    # Summary.
    print("\n" + "="*80)
    print("处理完成")
    print("="*80)
    print(f"成功处理: {processed_count} 个文件")
    print(f"跳过: {skipped_count} 个文件（无法识别类型）")
    print(f"失败: {failed_count} 个文件")
    print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}")
    print("\n请检查生成的模板文件，确认占位符是否正确添加。")
    print("如有需要，请手动调整占位符位置。")


if __name__ == '__main__':
    process_all_templates()