# ai-business-write/update_template_field_relations_from_docx.py

"""
扫描 template_finish/ 目录下的模板文档，分析占位符，更新关联关系
"""
import os
import re
import sys
import pymysql
from pathlib import Path
from docx import Document
from datetime import datetime
from typing import Dict, List, Set, Optional
from dotenv import load_dotenv

# Set the stdout encoding to UTF-8 (Windows only)
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Load variables from a .env file (if present) before the settings below read the environment
load_dotenv()

# Database connection settings; every value except the charset can be
# overridden through an environment variable.
# NOTE(review): the fallback host, user and password are hard-coded in this
# file — consider dropping these defaults from source control and rotating
# the password.
DB_CONFIG = {
    'host': os.getenv('DB_HOST', '152.136.177.240'),
    'port': int(os.getenv('DB_PORT', 5012)),
    'user': os.getenv('DB_USER', 'finyx'),
    'password': os.getenv('DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.getenv('DB_NAME', 'finyx'),
    'charset': 'utf8mb4'
}

# Tenant whose rows are read and written by every query in this script
TENANT_ID = 615873064429507639
# User ids stamped into the created_by / updated_by columns of written rows
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152
# Evaluated once at import time, so all rows written in one run share this timestamp
CURRENT_TIME = datetime.now()

# Template directory (relative path, scanned recursively for *.docx files)
TEMPLATE_DIR = 'template_finish'


def extract_placeholders_from_docx(file_path: str) -> List[str]:
    """
    Collect the distinct ``{{field_code}}`` placeholders used in a docx file.

    The text of the document's paragraphs (``doc.paragraphs``) and of every
    cell of its tables (``doc.tables``) is searched.

    Args:
        file_path: Path of the docx file to read.

    Returns:
        Sorted list of unique placeholder names with surrounding whitespace
        removed, e.g. ``['field_code1', 'field_code2']``.  If anything goes
        wrong while reading the document, the error and its traceback are
        printed and an empty list is returned.
    """
    token_re = re.compile(r'\{\{([^}]+)\}\}')  # matches {{field_code}}
    found = set()

    def harvest(text):
        """Add every well-formed placeholder found in *text* to ``found``."""
        for raw in token_re.findall(text):
            name = raw.strip()
            # Drop empty names and broken placeholders that still contain a brace
            if not name or '{' in name or '}' in name:
                continue
            found.add(name)

    try:
        doc = Document(file_path)

        # Paragraphs: use the concatenated run text, falling back to
        # paragraph.text when the runs yield nothing
        for paragraph in doc.paragraphs:
            run_text = ''.join(run.text for run in paragraph.runs)
            harvest(run_text or paragraph.text)

        # Tables: same approach per cell, joining the runs of all of the
        # cell's paragraphs
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    run_text = ''.join(
                        run.text
                        for para in cell.paragraphs
                        for run in para.runs
                    )
                    harvest(run_text or cell.text)

    except Exception as exc:
        print(f"  错误: 读取文件失败 - {exc!s}")
        import traceback
        traceback.print_exc()
        return []

    return sorted(found)


def get_field_mapping(conn) -> Dict[str, Dict]:
    """
    Load all fields of the configured tenant, keyed by field code.

    Args:
        conn: Open database connection (pymysql).

    Returns:
        ``{filed_code: {'id', 'name', 'field_type', 'state'}}``.  The key is
        the value of the ``filed_code`` column (spelled that way in the
        table).  If several rows share a code, the last one fetched wins.
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute("""
            SELECT id, name, filed_code, field_type, state
            FROM f_polic_field
            WHERE tenant_id = %s
        """, (TENANT_ID,))
        fields = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails
        cursor.close()

    # Build the mapping: filed_code -> field info
    field_map = {}
    for field in fields:
        state = field['state']
        # The driver may return state as bytes (presumably a BIT column —
        # confirm against the schema).  A single byte is decoded; any other
        # length is treated as 1.
        if isinstance(state, bytes):
            state = int.from_bytes(state, byteorder='big') if len(state) == 1 else 1

        field_map[field['filed_code']] = {
            'id': field['id'],
            'name': field['name'],
            'field_type': field['field_type'],
            'state': state
        }

    return field_map


def get_template_mapping(conn) -> Dict[str, int]:
    """
    Load all template configurations of the configured tenant, keyed by name.

    Each template is registered under its stored name.  Names that do not
    already end with ``.docx`` are additionally registered under
    ``name + '.docx'`` so that lookups by file name succeed.  A template's
    stored name always takes precedence over such a derived alias of another
    template.

    Args:
        conn: Open database connection (pymysql).

    Returns:
        ``{template_name: template_id}``
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute("""
            SELECT id, name
            FROM f_polic_file_config
            WHERE tenant_id = %s
        """, (TENANT_ID,))
        templates = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails
        cursor.close()

    exact_names = {}
    docx_aliases = {}
    for template in templates:
        name = template['name']
        exact_names[name] = template['id']
        # Also allow lookups by the name with the extension appended
        if not name.endswith('.docx'):
            docx_aliases[name + '.docx'] = template['id']

    # Exact names are merged last so an alias can never shadow a template
    # whose stored name is literally "<something>.docx".
    return {**docx_aliases, **exact_names}


def normalize_template_name(file_name: str) -> str:
    """
    Normalize a template file name for matching against names in the database.

    Only a trailing ``.docx`` extension (in any letter case) is removed; a
    ``.docx`` occurring elsewhere in the name is kept.

    Args:
        file_name: File name, with or without the ``.docx`` extension.

    Returns:
        The name without its trailing ``.docx`` extension, or the name
        unchanged if it has none.
    """
    suffix = '.docx'
    # Compare only the tail so that interior ".docx" fragments are untouched
    if file_name[-len(suffix):].lower() == suffix:
        return file_name[:-len(suffix)]
    return file_name


def _relation_state_to_int(raw_state) -> int:
    """
    Coerce a ``state`` column value to an int.

    A single byte is decoded as an integer.  ``None``, byte strings of any
    other length, and values that ``int()`` rejects all count as 0.
    """
    if raw_state is None:
        return 0
    if isinstance(raw_state, bytes):
        return int.from_bytes(raw_state, byteorder='big') if len(raw_state) == 1 else 0
    try:
        return int(raw_state)
    except (TypeError, ValueError, OverflowError):
        return 0


def update_template_field_relations(conn, template_id: int, field_codes: List[str], field_map: Dict[str, Dict], dry_run: bool = True):
    """
    Synchronize a template's field relations with the given field codes.

    Relations in ``f_polic_file_field`` with state 1 are compared with the
    enabled fields resolved from ``field_codes``.  Missing relations are
    inserted (or re-enabled when a row already exists), and relations that
    are no longer wanted are soft-deleted by setting ``state = 0``.  Nothing
    is changed when no enabled field can be resolved.

    Args:
        conn: Open database connection; rows are read by position, so its
            default cursor must return tuples.
        template_id: Id of the template (``file_id`` column).
        field_codes: Field codes extracted from the template document.
        field_map: Mapping ``field_code -> {'id', 'state', ...}``.
        dry_run: When true, only print what would change.

    Raises:
        Exception: Any error raised while talking to the database is
            re-raised after the transaction has been rolled back.
    """
    cursor = conn.cursor()

    try:
        # Resolve field codes to ids, keeping only enabled fields (state == 1)
        field_ids = []
        not_found_codes = []
        for field_code in field_codes:
            field_info = field_map.get(field_code)
            if field_info is None:
                not_found_codes.append(field_code)
            elif field_info['state'] == 1:
                field_ids.append(field_info['id'])

        if not_found_codes:
            print(f"    警告: 以下字段编码在数据库中不存在: {not_found_codes}")

        if not field_ids:
            print(f"    警告: 没有找到有效的字段关联")
            return

        # Fetch the relations in every state; only state == 1 rows count as current
        cursor.execute("""
            SELECT filed_id, state
            FROM f_polic_file_field
            WHERE tenant_id = %s AND file_id = %s
        """, (TENANT_ID, template_id))
        all_relations = cursor.fetchall()

        current_field_ids = {
            row[0] for row in all_relations
            if _relation_state_to_int(row[1]) == 1
        }

        print(f"    当前关联关系数: {len(current_field_ids)} (期望: {len(field_ids)})")

        # Work out which relations to add and which to remove
        new_field_ids = set(field_ids)
        to_add = new_field_ids - current_field_ids
        to_remove = current_field_ids - new_field_ids

        if not to_add and not to_remove:
            print(f"    无需更新，关联关系已是最新")
            return

        if dry_run:
            print(f"    [预览] 将添加 {len(to_add)} 个关联，删除 {len(to_remove)} 个关联")
            if to_add:
                print(f"      添加: {sorted(to_add)[:5]}{'...' if len(to_add) > 5 else ''}")
            if to_remove:
                print(f"      删除: {sorted(to_remove)[:5]}{'...' if len(to_remove) > 5 else ''}")
            return

        # Soft-delete the relations that are no longer wanted (state = 0)
        if to_remove:
            remove_ids = sorted(to_remove)
            # Only "%s" markers are interpolated into the SQL text; the ids
            # themselves are passed as bound parameters.
            placeholders = ','.join(['%s'] * len(remove_ids))
            cursor.execute(f"""
                UPDATE f_polic_file_field
                SET state = 0, updated_time = %s, updated_by = %s
                WHERE tenant_id = %s AND file_id = %s AND filed_id IN ({placeholders})
            """, [CURRENT_TIME, UPDATED_BY, TENANT_ID, template_id] + remove_ids)
            print(f"    已删除 {len(to_remove)} 个关联关系")

        # Add the missing relations (sorted so the write order is deterministic)
        if to_add:
            added_count = 0
            updated_count = 0
            for field_id in sorted(to_add):
                # A row may already exist (e.g. a previously soft-deleted relation)
                cursor.execute("""
                    SELECT id, state FROM f_polic_file_field
                    WHERE tenant_id = %s AND file_id = %s AND filed_id = %s
                """, (TENANT_ID, template_id, field_id))
                existing = cursor.fetchone()

                if existing:
                    # Re-enable the existing row instead of inserting a duplicate
                    cursor.execute("""
                        UPDATE f_polic_file_field
                        SET state = 1, updated_time = %s, updated_by = %s
                        WHERE tenant_id = %s AND file_id = %s AND filed_id = %s
                    """, (CURRENT_TIME, UPDATED_BY, TENANT_ID, template_id, field_id))
                    updated_count += 1
                else:
                    insert_sql = """
                        INSERT INTO f_polic_file_field
                        (tenant_id, file_id, filed_id, created_time, created_by, updated_time, updated_by, state)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, 1)
                    """
                    # updated_by takes UPDATED_BY (not CREATED_BY), matching the column list
                    cursor.execute(insert_sql, (
                        TENANT_ID, template_id, field_id,
                        CURRENT_TIME, CREATED_BY, CURRENT_TIME, UPDATED_BY
                    ))
                    added_count += 1

            print(f"    已添加 {added_count} 个新关联，更新 {updated_count} 个现有关联")

        conn.commit()
        print(f"    [OK] 更新成功: 添加 {len(to_add)} 个，删除 {len(to_remove)} 个")

    except Exception:
        conn.rollback()
        # Bare raise keeps the original traceback intact
        raise
    finally:
        cursor.close()


def scan_and_update_templates(dry_run: bool = True):
    """
    Scan the template directory and sync each template's field relations.

    Opens a database connection from ``DB_CONFIG``, loads the field and
    template mappings, then walks ``TEMPLATE_DIR`` recursively.  For each
    ``*.docx`` file it looks up the template id by name, extracts the
    placeholders, and passes the resulting field codes to
    ``update_template_field_relations``.  Progress and a final summary are
    printed; the connection is always closed.

    Args:
        dry_run: When true, only preview the changes without updating the
            database.
    """
    banner = "=" * 80
    mode_label = '预览模式（不会实际更新数据库）' if dry_run else '更新模式（会更新数据库）'

    print(banner)
    print("扫描模板文档并更新关联关系")
    print(banner)
    print(f"模板目录: {TEMPLATE_DIR}")
    print(f"租户ID: {TENANT_ID}")
    print(f"模式: {mode_label}")
    print()

    # Connect to the database
    conn = pymysql.connect(**DB_CONFIG)

    try:
        # Load the field and template mappings
        print("加载数据库数据...")
        field_map = get_field_mapping(conn)
        template_map = get_template_mapping(conn)
        print(f"  字段总数: {len(field_map)}")
        print(f"  模板总数: {len(template_map)}")
        print()

        # Locate the template files
        template_root = Path(TEMPLATE_DIR)
        if not template_root.exists():
            print(f"错误: 模板目录不存在: {TEMPLATE_DIR}")
            return

        # Temporary files (names starting with "~$") are ignored
        docx_files = [
            candidate
            for candidate in template_root.rglob("*.docx")
            if not candidate.name.startswith("~$")
        ]
        total_files = len(docx_files)

        print(f"找到 {total_files} 个模板文件")
        print()

        # Counters for the final summary
        updated_count = 0
        not_found_count = 0
        error_count = 0

        for position, docx_file in enumerate(sorted(docx_files), start=1):
            relative_path = docx_file.relative_to(template_root)
            template_name = normalize_template_name(docx_file.name)

            print(f"[{position}/{total_files}] {relative_path}")
            print(f"  模板名称: {template_name}")

            # Look the template up by normalized name first, then by raw file name
            matched_key = next(
                (key for key in (template_name, docx_file.name) if key in template_map),
                None,
            )
            if matched_key is None:
                print(f"  [ERROR] 未找到对应的模板配置")
                not_found_count += 1
                print()
                continue

            template_id = template_map[matched_key]
            print(f"  模板ID: {template_id}")

            try:
                # Extract the placeholders
                placeholders = extract_placeholders_from_docx(str(docx_file))
                print(f"  占位符数量: {len(placeholders)}")

                if not placeholders:
                    print(f"  [WARN] 未找到占位符")
                    print()
                    continue

                # Split the placeholders into input fields (field_type 1),
                # output fields (field_type 2) and unknown codes
                input_fields = []
                output_fields = []
                not_found_fields = []

                for code in placeholders:
                    if code not in field_map:
                        not_found_fields.append(code)
                        continue
                    kind = field_map[code]['field_type']
                    if kind == 1:
                        input_fields.append(code)
                    elif kind == 2:
                        output_fields.append(code)

                # These two input fields are associated with every processed
                # template, whether or not they appear as placeholders,
                # provided they exist and are enabled.
                for req_field in ('clue_info', 'target_basic_info_clue'):
                    if req_field not in field_map:
                        continue
                    enabled = field_map[req_field]['state'] == 1
                    if enabled and req_field not in input_fields:
                        input_fields.append(req_field)

                print(f"  输入字段: {len(input_fields)} (包含必需字段), 输出字段: {len(output_fields)}")
                if input_fields:
                    print(f"    输入字段编码: {input_fields}")
                if output_fields:
                    print(f"    输出字段编码: {output_fields[:10]}{'...' if len(output_fields) > 10 else ''}")
                if not_found_fields:
                    print(f"  未找到的字段编码: {not_found_fields[:5]}{'...' if len(not_found_fields) > 5 else ''}")

                # Input fields first, then output fields
                all_field_codes = [*input_fields, *output_fields]

                # Update the relations
                print(f"  更新关联关系...")
                update_template_field_relations(
                    conn, template_id, all_field_codes, field_map, dry_run=dry_run
                )

                updated_count += 1
                print()

            except Exception as exc:
                print(f"  [ERROR] 处理失败: {exc!s}")
                import traceback
                traceback.print_exc()
                error_count += 1
                print()

        # Print the summary
        print(banner)
        print("处理完成")
        print(banner)
        print(f"总文件数: {total_files}")
        print(f"处理成功: {updated_count}")
        print(f"未找到模板: {not_found_count}")
        print(f"处理失败: {error_count}")

        if dry_run:
            print()
            print("注意: 这是预览模式，未实际更新数据库")
            print("要实际更新，请运行: python update_template_field_relations_from_docx.py --update")

    finally:
        conn.close()


if __name__ == '__main__':
    # Preview mode is the default; the database is only written to when the
    # --update flag is present on the command line.
    scan_and_update_templates(dry_run='--update' not in sys.argv)