# ai-business-write/process_templates.py

"""
处理原始模板文档，自动添加占位符
根据占位符与字段对照表，智能识别文档类型并添加相应的占位符
使用AI大模型智能分析文档内容，识别可替换位置
"""
import os
import re
from pathlib import Path
from typing import Dict, List, Optional
import json

try:
    from docx import Document
    from docx.shared import Pt
except ImportError:
    print("错误: 请先安装 python-docx: pip install python-docx")
    exit(1)

# 尝试导入AI辅助工具
try:
    from template_ai_helper import TemplateAIHelper, get_available_fields_for_document
    HAS_AI_HELPER = True
except ImportError:
    HAS_AI_HELPER = False
    print("警告: 无法导入AI辅助工具，将使用基础模式（不使用AI分析）")

# 尝试导入win32com用于.doc文件转换（Windows系统）
HAS_WIN32COM = False
HAS_PYTHONCOM = False
try:
    import win32com.client
    HAS_WIN32COM = True
    try:
        import pythoncom
        HAS_PYTHONCOM = True
    except ImportError:
        pass
except ImportError:
    pass

if not HAS_WIN32COM:
    print("="*60)
    print("警告: 未安装 pywin32，无法自动转换 .doc 文件")
    print("="*60)
    print("解决方案:")
    print("  1. 安装 pywin32: pip install pywin32")
    print("  2. 或者手动将所有 .doc 文件转换为 .docx 格式")
    print("  3. 转换后重新运行此脚本")
    print("="*60)

# Project root: the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent
# Source templates are read from "<root>/模板/原始模板"; processed templates are
# written under "<root>/模板", mirroring the source sub-directory layout.
ORIGINAL_TEMPLATES_DIR = PROJECT_ROOT / "模板" / "原始模板"
OUTPUT_TEMPLATES_DIR = PROJECT_ROOT / "模板"
# Markdown placeholder/field reference table.
# NOTE(review): not referenced anywhere else in the visible part of this file.
FIELD_MAPPING_FILE = PROJECT_ROOT / "占位符与字段对照表.md"

# Document-type registry, keyed by the title fragment that is searched for in
# template file names.  Every entry holds the template code, the placeholder
# field codes used by that template ("fields") and its "input_fields" codes.
# Key order is significant to first-match lookups and is kept as-is.

# Field codes shared by all templates that carry the standard personal profile.
_TARGET_PROFILE_FIELDS = (
    "target_name", "target_organization_and_position", "target_gender",
    "target_date_of_birth_full", "target_political_status", "target_address",
    "target_registered_address", "target_contact", "target_place_of_origin",
    "target_ethnicity", "target_id_number", "investigation_team_code",
)

# Input field codes used by templates that are fed from clue information.
_CLUE_INPUT_FIELDS = ("clue_info", "target_basic_info_clue")


def _profile_template(template_code, with_clue_inputs=False):
    """Build a registry entry that uses the standard profile field list.

    Each call returns fresh list objects so entries never share mutable state.
    """
    return {
        "template_code": template_code,
        "fields": list(_TARGET_PROFILE_FIELDS),
        "input_fields": list(_CLUE_INPUT_FIELDS) if with_clue_inputs else [],
    }


DOCUMENT_TYPE_MAPPING = {
    "请示报告卡": {
        "template_code": "REPORT_CARD",
        "fields": [
            "target_name",
            "target_organization_and_position",
            "report_card_request_time",
        ],
        "input_fields": ["clue_info"],
    },
    "初步核实审批表": {
        "template_code": "PRELIMINARY_VERIFICATION_APPROVAL",
        "fields": [
            "target_name",
            "target_organization_and_position",
            "target_gender",
            "target_date_of_birth",
            "target_political_status",
            "target_professional_rank",
            "clue_source",
            "target_issue_description",
            "department_opinion",
            "filler_name",
        ],
        "input_fields": list(_CLUE_INPUT_FIELDS),
    },
    "初核方案": {
        "template_code": "INVESTIGATION_PLAN",
        "fields": [
            "target_name",
            "target_organization_and_position",
            "target_work_basic_info",
            "target_issue_description",
            "investigation_unit_name",
            "investigation_team_leader_name",
            "investigation_team_member_names",
            "investigation_location",
        ],
        "input_fields": list(_CLUE_INPUT_FIELDS),
    },
    "谈话通知书": {
        "template_code": "NOTIFICATION_LETTER",
        "fields": [
            "target_name",
            "target_organization_and_position",
            "target_id_number",
            "appointment_time",
            "appointment_location",
            "approval_time",
            "handling_department",
            "handler_name",
            "notification_time",
            "notification_location",
        ],
        "input_fields": ["target_basic_info_clue"],
    },
    "谈话笔录": _profile_template("INTERVIEW_RECORD"),
    "谈话询问对象情况摸底调查30问": _profile_template("INVESTIGATION_30_QUESTIONS"),
    "被谈话人权利义务告知书": _profile_template("RIGHTS_OBLIGATIONS_NOTICE"),
    "点对点交接单": _profile_template("HANDOVER_FORM"),
    "陪送交接单": _profile_template("ESCORT_HANDOVER_FORM"),
    "保密承诺书": _profile_template("CONFIDENTIALITY_COMMITMENT"),
    "办案人员-办案安全保密承诺书": _profile_template("INVESTIGATOR_CONFIDENTIALITY_COMMITMENT"),
    "请示报告卡（初核报告结论）": {
        "template_code": "REPORT_CARD_CONCLUSION",
        "fields": [
            "investigation_team_code",
            "target_name",
            "target_problem_description",
            "target_attitude",
        ],
        "input_fields": [],
    },
    "初核情况报告": {
        "template_code": "INVESTIGATION_REPORT",
        "fields": [
            "target_name",
            "commission_name",
            "target_work_basic_info",
            "target_issue_description",
            "target_problem_description",
            "target_organization_and_position",
        ],
        "input_fields": list(_CLUE_INPUT_FIELDS),
    },
    "谈话审批表": _profile_template("INTERVIEW_APPROVAL_FORM", with_clue_inputs=True),
    "谈话前安全风险评估表": _profile_template("PRE_INTERVIEW_RISK_ASSESSMENT", with_clue_inputs=True),
    "谈话方案": _profile_template("INTERVIEW_PLAN", with_clue_inputs=True),
    "谈话后安全风险评估表": _profile_template("POST_INTERVIEW_RISK_ASSESSMENT", with_clue_inputs=True),
}

# Maps the human-readable field label (as it appears in a template, before the
# colon) to the field code that is written inside the "{{...}}" placeholder.
# Several labels may map to the same code (see the two ID-number labels).
FIELD_NAME_TO_CODE = {
    # Personal details of the person under review
    "被核查人姓名": "target_name",
    "被核查人员单位及职务": "target_organization_and_position",
    "被核查人员性别": "target_gender",
    "被核查人员出生年月": "target_date_of_birth",
    "被核查人员出生年月日": "target_date_of_birth_full",
    "被核查人员政治面貌": "target_political_status",
    "被核查人员职级": "target_professional_rank",
    "被核查人员身份证号": "target_id_number",
    "被核查人员身份证件及号码": "target_id_number",
    "被核查人员住址": "target_address",
    "被核查人员户籍住址": "target_registered_address",
    "被核查人员联系方式": "target_contact",
    "被核查人员籍贯": "target_place_of_origin",
    "被核查人员民族": "target_ethnicity",
    # Clue and issue descriptions
    "线索来源": "clue_source",
    "主要问题线索": "target_issue_description",
    "被核查人问题描述": "target_problem_description",
    "被核查人员工作基本情况": "target_work_basic_info",
    # Investigation unit / team
    "核查单位名称": "investigation_unit_name",
    "核查组组长姓名": "investigation_team_leader_name",
    "核查组成员姓名": "investigation_team_member_names",
    "核查地点": "investigation_location",
    "核查组代号": "investigation_team_code",
    # Appointment, approval and notification details
    "应到时间": "appointment_time",
    "应到地点": "appointment_location",
    "批准时间": "approval_time",
    "承办部门": "handling_department",
    "承办人": "handler_name",
    "谈话通知时间": "notification_time",
    "谈话通知地点": "notification_location",
    # Form-specific fields
    "请示报告卡请示时间": "report_card_request_time",
    "初步核实审批表承办部门意见": "department_opinion",
    "初步核实审批表填表人": "filler_name",
    "被核查人员本人认识和态度": "target_attitude",
    "纪委名称": "commission_name"
}


def convert_doc_to_docx(doc_path: Path) -> Optional[Path]:
    """
    Convert a legacy .doc file to .docx by automating Microsoft Word over COM.

    The converted file is written next to the source file, using the same
    stem with a ``.docx`` suffix.  Word and COM resources are always released
    in a ``finally`` block, whether the conversion succeeds or fails.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        Path of the converted .docx file, or None when pywin32 is not
        available, the source file does not exist, Word reports an error, or
        the output file is missing or empty afterwards.
    """
    if not HAS_WIN32COM:
        print(f"  警告: 未安装 pywin32，无法转换 {doc_path.name}")
        print(f"  解决方案: pip install pywin32")
        print(f"  或者: 请手动将 {doc_path.name} 转换为 .docx 格式")
        return None

    # Validate the input before paying for a Word start-up.
    if not doc_path.exists():
        print(f"  ✗ 错误: 源文件不存在: {doc_path}")
        return None

    docx_path = doc_path.with_suffix('.docx')
    # Word's COM API needs absolute paths.
    abs_doc_path = str(doc_path.absolute())
    abs_docx_path = str(docx_path.absolute())

    word = None
    doc = None
    com_initialized = False

    try:
        if HAS_PYTHONCOM:
            pythoncom.CoInitialize()
            com_initialized = True

        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        word.DisplayAlerts = 0  # suppress Word's dialogs

        print("    正在转换...")
        print(f"    源: {doc_path.name}")
        print(f"    目标: {docx_path.name}")

        doc = word.Documents.Open(
            abs_doc_path,
            ReadOnly=True,
            ConfirmConversions=False,
            AddToRecentFiles=False
        )

        # 16 = wdFormatXMLDocument (.docx)
        doc.SaveAs2(abs_docx_path, FileFormat=16)

    except Exception as e:
        error_msg = str(e)
        print(f"  ✗ 转换失败: {type(e).__name__}: {error_msg}")

        # Map the error text to the most likely cause and a suggested fix.
        print("  诊断信息:")
        if "Word.Application" in error_msg or "COM" in error_msg or "CreateObject" in error_msg:
            print("    - 可能原因: Microsoft Word 未安装或无法访问")
            print("    - 解决方案:")
            print("      1. 确保已安装 Microsoft Word（不是 WPS）")
            print("      2. 手动将 .doc 文件转换为 .docx 格式")
            print("      3. 使用 Word 打开文件，另存为 .docx 格式")
        elif "pywin32" in error_msg.lower() or "win32com" in error_msg.lower():
            print("    - 解决方案: pip install pywin32")
        elif "权限" in error_msg or "Permission" in error_msg:
            print("    - 可能原因: 文件被其他程序占用或权限不足")
            print("    - 解决方案: 关闭文件，检查文件权限")
        else:
            print("    - 请检查错误信息并手动转换文件")

        return None

    finally:
        # Best-effort cleanup: a failure while closing must neither mask the
        # original error nor turn a successful save into a reported failure.
        if doc is not None:
            try:
                doc.Close(False)  # False: do not save changes
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
        # Drop the COM references before tearing COM down.
        doc = None
        word = None
        if com_initialized:
            try:
                pythoncom.CoUninitialize()
            except Exception:
                pass

    # Word reported success; make sure it really produced a non-empty file.
    try:
        file_size = docx_path.stat().st_size
    except OSError:
        file_size = 0

    if file_size > 0:
        print(f"  ✓ 转换成功 ({file_size} 字节)")
        return docx_path

    print("  ✗ 转换失败: 目标文件不存在或为空")
    return None


def identify_document_type(file_name: str, mapping: Optional[Dict[str, Dict]] = None) -> Optional[Dict]:
    """
    Identify the document type of a template from its file name.

    The extension and the "XXX" filler markers are stripped from the name,
    then the registry key that occurs in the remaining name is looked up.
    When several keys occur in the name, the longest one wins, so that a
    specific type such as "请示报告卡（初核报告结论）" is not shadowed by the
    shorter "请示报告卡" that it contains.

    Args:
        file_name: File name (with or without directory and extension).
        mapping: Registry of ``{title fragment: config}``; defaults to
            DOCUMENT_TYPE_MAPPING.

    Returns:
        The configuration of the matched document type, or None when no
        registry key matches.
    """
    if mapping is None:
        mapping = DOCUMENT_TYPE_MAPPING

    # Strip the extension and the common "XXX" filler markers.
    base_name = Path(file_name).stem
    for marker in ("（XXX）", "(XXX)", "XXX"):
        base_name = base_name.replace(marker, "")
    base_name = base_name.strip()

    # Exact-fragment match: prefer the most specific (longest) key; on equal
    # length the key that comes first in the registry is kept.
    best_key = None
    for doc_type in mapping:
        if doc_type and doc_type in base_name:
            if best_key is None or len(doc_type) > len(best_key):
                best_key = doc_type
    if best_key is not None:
        return mapping[best_key]

    # Fallback for keys made of several whitespace-separated keywords: accept
    # the first key that has any keyword present in the name.
    for doc_type, config in mapping.items():
        if any(keyword in base_name for keyword in doc_type.split()):
            return config

    return None


def find_placeholder_positions(text: str, field_name: str, field_code: str) -> List[tuple]:
    """
    Locate values in *text* that follow a field label and look replaceable.

    A candidate is everything after "<field_name>:" (ASCII or full-width
    colon, optional whitespace) up to the end of the line.  Candidates that
    are blank, already start with "{{", or are one of the known filler values
    are ignored.

    Args:
        text: Text to scan.
        field_name: Field label to search for.
        field_code: Field code used to build the "{{field_code}}" placeholder.

    Returns:
        A list of ``(start, end, placeholder)`` tuples, where start/end
        delimit the captured value (as captured, i.e. before stripping).
    """
    filler_values = {"XXX", "xxx", "-", "——", "待填", "待填写"}
    placeholder = "{{" + field_code + "}}"
    label_regex = re.compile(re.escape(field_name) + r"[：:]\s*([^\n\r]+)")

    def looks_replaceable(captured: str) -> bool:
        candidate = captured.strip()
        if not candidate:
            return False
        if candidate.startswith("{{"):
            return False
        return candidate not in filler_values

    return [
        (hit.start(1), hit.end(1), placeholder)
        for hit in label_regex.finditer(text)
        if looks_replaceable(hit.group(1))
    ]


def replace_text_in_runs(runs, old_text: str, new_text: str) -> bool:
    """
    Replace every occurrence of *old_text* in the concatenated text of *runs*.

    Occurrences are located in the joined text, so a match that is split
    across several runs is handled as well: the replacement is written into
    the run where the match starts and the remaining matched characters are
    removed from the following runs.  Only runs whose text actually changes
    are assigned to, so untouched runs keep their content as it is.

    Args:
        runs: Sequence of objects exposing a readable/writable ``text``
            attribute (e.g. the runs of a paragraph).
        old_text: Text to search for; an empty string never matches.
        new_text: Replacement text.

    Returns:
        True if at least one occurrence was replaced, otherwise False.
    """
    if not old_text:
        # str.replace("", x) would insert x between every character.
        return False

    runs = list(runs)
    texts = [run.text for run in runs]
    full_text = ''.join(texts)

    # Non-overlapping match spans, as offsets into the joined text.
    spans = []
    found_at = full_text.find(old_text)
    while found_at != -1:
        spans.append((found_at, found_at + len(old_text)))
        found_at = full_text.find(old_text, found_at + len(old_text))
    if not spans:
        return False

    offset = 0
    span_idx = 0  # first span that is not fully consumed yet
    for run, text in zip(runs, texts):
        run_start, run_end = offset, offset + len(text)
        offset = run_end

        pieces = []
        cursor = run_start
        while span_idx < len(spans) and spans[span_idx][0] < run_end:
            start, end = spans[span_idx]
            # Keep the text between the previous match and this one.
            pieces.append(text[cursor - run_start:max(start, run_start) - run_start])
            if start >= run_start:
                # The match begins in this run: the replacement goes here.
                pieces.append(new_text)
            cursor = min(end, run_end)
            if end > run_end:
                # The match continues in the next run.
                break
            span_idx += 1
        pieces.append(text[cursor - run_start:])

        updated = ''.join(pieces)
        if updated != text:
            run.text = updated

    return True


def apply_ai_replacements(text: str, ai_replacements: List[Dict], min_confidence: float = 0.7) -> str:
    """
    Apply the replacement suggestions produced by the AI helper to *text*.

    Suggestions are applied from highest to lowest confidence.  Each accepted
    suggestion replaces only the first literal occurrence of its
    ``original_text``, to avoid replacing the same value repeatedly.

    Args:
        text: Original text.
        ai_replacements: Suggestions, each a dict with the keys
            ``original_text``, ``replacement`` and ``confidence``.
        min_confidence: A suggestion is applied only when its confidence is
            strictly greater than this value.

    Returns:
        The text with all accepted suggestions applied.
    """
    def confidence_of(suggestion: Dict) -> float:
        # Missing or non-numeric confidences are treated as 0 (never applied).
        try:
            return float(suggestion.get('confidence', 0))
        except (TypeError, ValueError):
            return 0.0

    suggestions = [item for item in (ai_replacements or []) if isinstance(item, dict)]
    # Stable sort: suggestions with equal confidence keep their input order.
    suggestions.sort(key=confidence_of, reverse=True)

    result_text = text
    for suggestion in suggestions:
        if not confidence_of(suggestion) > min_confidence:
            continue

        original = suggestion.get('original_text', '')
        replacement_text = suggestion.get('replacement', '')
        if not (isinstance(original, str) and isinstance(replacement_text, str)):
            continue
        if not original or not replacement_text:
            continue

        # Literal match on purpose: str.replace needs no regex escaping, and
        # testing for an escaped form would miss any text containing
        # characters such as ".", "(" or "-".
        if original in result_text:
            result_text = result_text.replace(original, replacement_text, 1)

    return result_text


def _build_field_rules(field_codes, name_to_code):
    """Compile the label patterns for every label that maps to a wanted field code.

    Returns a list of ``(field_code, value_pattern, filler_pattern)`` tuples in
    field order.  All labels of a code are included, so alias labels (several
    labels mapped to the same code) are matched too.
    """
    rules = []
    for field_code in field_codes:
        for field_name, code in name_to_code.items():
            if code != field_code:
                continue
            label = re.escape(field_name)
            rules.append((
                field_code,
                # "<label>: <value>" - the value stops at whitespace, a Chinese
                # comma / full stop, or the end of the text, and never
                # contains "{" (i.e. it is not already a placeholder).
                re.compile(rf"({label}[：:]\s*)([^\n\r{{]+?)(\s|$|\n|\r|，|。)"),
                # "<label>: XXX" style fillers.
                re.compile(rf"({label}[：:]\s*)(XXX|xxx|待填|待填写)"),
            ))
    return rules


def _apply_field_rules(text, rules):
    """Return *text* with the value after each known label replaced by its placeholder."""
    for field_code, value_pattern, filler_pattern in rules:
        placeholder = "{{" + field_code + "}}"

        def fill_value(match, placeholder=placeholder):
            value = match.group(2).strip()
            # Leave blanks and dash-only values untouched.
            if value and not value.startswith("{{") and value not in ("——", "—", "-"):
                return f"{match.group(1)}{placeholder}{match.group(3)}"
            return match.group(0)

        text = value_pattern.sub(fill_value, text)
        text = filler_pattern.sub(
            lambda match, placeholder=placeholder: match.group(1) + placeholder,
            text,
        )
    return text


def _rewrite_paragraph(paragraph, rules, analyze=None, location=""):
    """Add placeholders to one paragraph (AI suggestions first, then label rules).

    Args:
        paragraph: Paragraph object exposing ``text``, ``clear()`` and ``add_run()``.
        rules: Output of ``_build_field_rules``.
        analyze: Optional callable ``text -> list of AI suggestions``; a
            failure is reported and the label rules are still applied.
        location: Human-readable position used in the warning message.

    Returns:
        ``(changed, ai_count)``: whether the paragraph was rewritten, and the
        number of AI suggestions counted for it.
    """
    original_text = paragraph.text
    if not original_text:
        return False, 0

    text = original_text
    ai_count = 0

    if analyze is not None:
        try:
            ai_replacements = analyze(original_text)
            if ai_replacements:
                text = apply_ai_replacements(text, ai_replacements)
                if text != original_text:
                    ai_count = len(ai_replacements)
        except Exception as e:
            print(f"    ⚠ {location} AI分析失败: {e}")

    # Label rules run as a complement to the AI pass.
    text = _apply_field_rules(text, rules)

    if text == original_text:
        return False, ai_count

    # The whole paragraph text is replaced by a single new run.
    paragraph.clear()
    paragraph.add_run(text)
    return True, ai_count


def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool:
    """
    Process a single document: add placeholders and save the result.

    A ``.doc`` input is first converted to ``.docx``.  Body paragraphs and
    table-cell paragraphs are then rewritten: AI suggestions are applied
    first (when the helper is available), followed by the label rules built
    from the document's configured fields.

    Args:
        input_path: Path of the input file (.doc or .docx).
        output_path: Path the processed .docx is saved to.
        doc_config: Document-type configuration (``template_code``, ``fields``).
        use_ai: Whether to use AI analysis (default True).

    Returns:
        True if the document was processed and saved, False if the conversion
        or any processing step failed.
    """
    try:
        # Legacy .doc files must be converted before python-docx can open them.
        if input_path.suffix.lower() == '.doc':
            print(f"  转换 .doc 到 .docx: {input_path.name}")
            docx_path = convert_doc_to_docx(input_path)
            if not docx_path or not docx_path.exists():
                print(f"  ⚠ 跳过: 无法转换 {input_path.name}")
                return False
            input_path = docx_path

        # Set up the AI helper when requested and importable.
        ai_helper = None
        available_fields = []
        if use_ai and HAS_AI_HELPER:
            try:
                ai_helper = TemplateAIHelper()
                available_fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE)
                print(f"  ✓ AI分析已启用")
            except Exception as e:
                print(f"  ⚠ AI分析不可用: {e}，将使用基础模式")
                ai_helper = None

        doc = Document(str(input_path))

        # Compile the label patterns once per document instead of per paragraph.
        rules = _build_field_rules(doc_config.get('fields', []), FIELD_NAME_TO_CODE)
        doc_type = doc_config.get('template_code', '未知')
        ai_enabled = bool(ai_helper and available_fields)

        replacement_count = 0
        ai_replacement_count = 0

        # Body paragraphs
        analyze_paragraph = None
        if ai_enabled:
            def analyze_paragraph(text):
                return ai_helper.analyze_paragraph(text, available_fields, doc_type)

        for para_idx, paragraph in enumerate(doc.paragraphs):
            changed, ai_count = _rewrite_paragraph(
                paragraph, rules, analyze_paragraph, f"段落 {para_idx+1}"
            )
            ai_replacement_count += ai_count
            if changed:
                replacement_count += 1

        # Table cells
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    analyze_cell = None
                    if ai_enabled:
                        def analyze_cell(text, row_idx=row_idx, col_idx=col_idx):
                            return ai_helper.analyze_table_cell(
                                text, available_fields, doc_type, row_idx, col_idx
                            )
                    location = f"表格 {table_idx+1} 单元格({row_idx+1},{col_idx+1})"
                    for paragraph in cell.paragraphs:
                        changed, ai_count = _rewrite_paragraph(
                            paragraph, rules, analyze_cell, location
                        )
                        ai_replacement_count += ai_count
                        if changed:
                            replacement_count += 1

        # Make sure the output directory exists, then save.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))

        # Summary
        if replacement_count > 0 or ai_replacement_count > 0:
            msg = f"  ✓ 处理成功"
            if ai_replacement_count > 0:
                msg += f"，AI识别 {ai_replacement_count} 处"
            if replacement_count > 0:
                msg += f"，规则匹配 {replacement_count} 处"
            print(msg)
        else:
            print(f"  ⚠ 处理完成，但未找到需要替换的内容（可能已包含占位符）")

        return True

    except Exception as e:
        # Top-level boundary for one document: report it and let the batch go on.
        print(f"  ✗ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False


def process_all_templates():
    """
    Process every original template found under ORIGINAL_TEMPLATES_DIR.

    Walks the directory tree, identifies the document type of each
    .doc/.docx file from its file name, and writes the processed template to
    OUTPUT_TEMPLATES_DIR under the same relative sub-directory (always with a
    .docx suffix).  Prints a per-file log and a final summary.
    """
    print("="*80)
    print("开始处理原始模板文档")
    print("="*80)

    if not ORIGINAL_TEMPLATES_DIR.exists():
        print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}")
        return

    processed_count = 0
    skipped_count = 0
    failed_count = 0

    for root, _dirs, files in os.walk(ORIGINAL_TEMPLATES_DIR):
        # Sorted for a reproducible processing order and log.
        for file in sorted(files):
            # Only Word documents; the check is case-insensitive so that
            # ".DOC"/".DOCX" are handled the same way process_document does.
            if Path(file).suffix.lower() not in ('.doc', '.docx'):
                continue
            # "~$name.docx" files are Word's lock files, not real documents.
            if file.startswith('~$'):
                continue

            input_path = Path(root) / file

            doc_config = identify_document_type(file)

            if not doc_config:
                print(f"\n⚠ 无法识别文档类型: {file}")
                print(f"  路径: {input_path}")
                skipped_count += 1
                continue

            # Keep the relative directory structure in the output tree.
            relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR)
            output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{Path(file).stem}.docx"

            print(f"\n处理: {file}")
            print(f"  类型: {doc_config.get('template_code', 'UNKNOWN')}")
            print(f"  输出: {output_path}")

            if process_document(input_path, output_path, doc_config, use_ai=True):
                processed_count += 1
            else:
                failed_count += 1

    # Summary
    print("\n" + "="*80)
    print("处理完成")
    print("="*80)
    print(f"成功处理: {processed_count} 个文件")
    print(f"跳过: {skipped_count} 个文件")
    print(f"失败: {failed_count} 个文件")
    print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}")
    print("\n请检查生成的模板文件，确认占位符是否正确添加。")
    print("如有需要，请手动调整占位符位置。")


# Script entry point: run the batch only when executed directly, not on import.
if __name__ == '__main__':
    process_all_templates()