ai-business-write/rescan_and_update_templates.py

"""
重新扫描模板占位符并更新数据库
1. 扫描所有本地模板文件（包括新转换的.docx文件）
2. 提取所有占位符
3. 检查数据库中的模板记录
4. 更新数据库（如有变化）
"""
import os
import pymysql
from pathlib import Path
from typing import Dict, List, Set, Tuple
from dotenv import load_dotenv
import re
from docx import Document

# 加载环境变量
load_dotenv()

# 数据库配置
DB_CONFIG = {
    'host': os.getenv('DB_HOST', '152.136.177.240'),
    'port': int(os.getenv('DB_PORT', 5012)),
    'user': os.getenv('DB_USER', 'finyx'),
    'password': os.getenv('DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.getenv('DB_NAME', 'finyx'),
    'charset': 'utf8mb4'
}

CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152

# 项目根目录
PROJECT_ROOT = Path(__file__).parent
TEMPLATES_DIR = PROJECT_ROOT / "template_finish"


def print_section(title):
    """打印章节标题"""
    print("\n" + "="*70)
    print(f"  {title}")
    print("="*70)


def print_result(success, message):
    """打印结果"""
    status = "[OK]" if success else "[FAIL]"
    print(f"{status} {message}")


def generate_id():
    """生成ID"""
    import time
    return int(time.time() * 1000000)


def scan_local_templates(base_dir: Path) -> Dict[str, Path]:
    """扫描本地模板文件"""
    templates = {}
    if not base_dir.exists():
        return templates

    for file_path in base_dir.rglob('*'):
        if file_path.is_file():
            # 只处理文档文件（优先处理.docx，也包含.doc和.wps用于检查）
            if file_path.suffix.lower() in ['.doc', '.docx', '.wps']:
                relative_path = file_path.relative_to(PROJECT_ROOT)
                relative_path_str = str(relative_path).replace('\\', '/')
                templates[relative_path_str] = file_path

    return templates


def get_actual_tenant_id(conn) -> int:
    """获取数据库中的实际tenant_id"""
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute("SELECT DISTINCT tenant_id FROM f_polic_file_config LIMIT 1")
        result = cursor.fetchone()
        if result:
            return result['tenant_id']
        return 1  # 默认值
    finally:
        cursor.close()


def get_db_templates(conn, tenant_id: int) -> Dict[str, Dict]:
    """从数据库获取所有模板配置"""
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        sql = """
            SELECT id, name, file_path, state, parent_id
            FROM f_polic_file_config
            WHERE tenant_id = %s
        """
        cursor.execute(sql, (tenant_id,))
        templates = cursor.fetchall()

        result = {}
        for template in templates:
            file_path = template['file_path']
            if file_path:
                result[file_path] = {
                    'id': template['id'],
                    'name': template['name'],
                    'file_path': file_path,
                    'state': template['state'],
                    'parent_id': template['parent_id']
                }
        return result
    finally:
        cursor.close()


def extract_placeholders_from_docx(file_path: Path) -> Tuple[Set[str], bool]:
    """
    从docx文件中提取所有占位符

    Returns:
        (占位符集合, 是否成功读取)
    """
    placeholders = set()
    placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}')
    success = False

    try:
        doc = Document(file_path)
        success = True

        # 从段落中提取占位符
        for paragraph in doc.paragraphs:
            text = paragraph.text
            matches = placeholder_pattern.findall(text)
            for match in matches:
                field_code = match.strip()
                if field_code:
                    placeholders.add(field_code)

        # 从表格中提取占位符
        for table in doc.tables:
            try:
                for row in table.rows:
                    for cell in row.cells:
                        for paragraph in cell.paragraphs:
                            text = paragraph.text
                            matches = placeholder_pattern.findall(text)
                            for match in matches:
                                field_code = match.strip()
                                if field_code:
                                    placeholders.add(field_code)
            except Exception as e:
                # 某些表格结构可能导致错误，跳过
                continue
    except Exception as e:
        # 文件读取失败（可能是.doc格式或其他问题）
        return placeholders, False

    return placeholders, success


def scan_all_templates_placeholders(local_templates: Dict[str, Path]) -> Dict[str, Tuple[Set[str], bool, str]]:
    """
    扫描所有模板的占位符

    Returns:
        字典，key为相对路径，value为(占位符集合, 是否成功读取, 文件扩展名)
    """
    results = {}

    for rel_path, file_path in local_templates.items():
        file_ext = file_path.suffix.lower()
        placeholders, success = extract_placeholders_from_docx(file_path)
        results[rel_path] = (placeholders, success, file_ext)

    return results


def update_or_create_template(conn, tenant_id: int, rel_path: str, file_path: Path, db_templates: Dict[str, Dict]):
    """更新或创建模板记录"""
    cursor = conn.cursor()

    try:
        # 检查是否已存在
        if rel_path in db_templates:
            # 已存在，检查是否需要更新
            template_id = db_templates[rel_path]['id']
            # 这里可以添加更新逻辑，比如更新名称等
            return template_id, 'exists'
        else:
            # 不存在，创建新记录
            template_id = generate_id()
            file_name = file_path.stem  # 不含扩展名的文件名

            cursor.execute("""
                INSERT INTO f_polic_file_config
                (id, tenant_id, parent_id, name, input_data, file_path, created_time, created_by, updated_time, updated_by, state)
                VALUES (%s, %s, %s, %s, %s, %s, NOW(), %s, NOW(), %s, 1)
            """, (
                template_id,
                tenant_id,
                None,  # parent_id
                file_name,
                '{}',  # input_data
                rel_path,
                CREATED_BY,
                UPDATED_BY
            ))
            conn.commit()
            return template_id, 'created'
    except Exception as e:
        conn.rollback()
        raise e
    finally:
        cursor.close()


def main():
    """主函数"""
    print_section("重新扫描模板占位符并更新数据库")

    # 1. 扫描本地模板
    print_section("1. 扫描本地模板文件")
    local_templates = scan_local_templates(TEMPLATES_DIR)
    print_result(True, f"找到 {len(local_templates)} 个本地模板文件")

    # 统计文件类型
    file_types = {}
    for file_path in local_templates.values():
        ext = file_path.suffix.lower()
        file_types[ext] = file_types.get(ext, 0) + 1

    print("\n文件类型统计:")
    for ext, count in sorted(file_types.items()):
        print(f"  {ext}: {count} 个")

    if not local_templates:
        print_result(False, "未找到本地模板文件")
        return

    # 2. 连接数据库
    print_section("2. 连接数据库")
    try:
        conn = pymysql.connect(**DB_CONFIG)
        print_result(True, "数据库连接成功")
    except Exception as e:
        print_result(False, f"数据库连接失败: {str(e)}")
        return

    try:
        # 3. 获取实际的tenant_id
        print_section("3. 获取实际的tenant_id")
        tenant_id = get_actual_tenant_id(conn)
        print_result(True, f"实际tenant_id: {tenant_id}")

        # 4. 获取数据库中的模板
        print_section("4. 获取数据库中的模板配置")
        db_templates = get_db_templates(conn, tenant_id)
        print_result(True, f"找到 {len(db_templates)} 条数据库模板记录（有file_path的）")

        # 5. 扫描所有模板的占位符
        print_section("5. 扫描所有模板的占位符")
        print("  正在扫描，请稍候...")

        template_placeholders = scan_all_templates_placeholders(local_templates)

        # 统计结果
        all_placeholders = set()
        templates_with_placeholders = 0
        templates_without_placeholders = 0
        templates_read_success = 0
        templates_read_failed = 0
        doc_files = []
        docx_files = []

        for rel_path, (placeholders, success, file_ext) in template_placeholders.items():
            all_placeholders.update(placeholders)

            if success:
                templates_read_success += 1
                if placeholders:
                    templates_with_placeholders += 1
                else:
                    templates_without_placeholders += 1
            else:
                templates_read_failed += 1
                if file_ext == '.doc':
                    doc_files.append(rel_path)

            if file_ext == '.docx':
                docx_files.append(rel_path)
            elif file_ext == '.doc':
                doc_files.append(rel_path)

        print(f"\n扫描结果统计:")
        print(f"  - 成功读取: {templates_read_success} 个")
        print(f"  - 读取失败: {templates_read_failed} 个")
        print(f"  - 有占位符: {templates_with_placeholders} 个")
        print(f"  - 无占位符: {templates_without_placeholders} 个")
        print(f"  - 发现的占位符总数: {len(all_placeholders)} 个不同的占位符")

        if doc_files:
            print(f"\n  [注意] 发现 {len(doc_files)} 个.doc文件（可能无法读取）:")
            for doc_file in doc_files[:5]:
                print(f"    - {doc_file}")
            if len(doc_files) > 5:
                print(f"    ... 还有 {len(doc_files) - 5} 个")

        print(f"\n  .docx文件: {len(docx_files)} 个")

        # 6. 显示所有占位符
        print_section("6. 所有占位符列表")
        if all_placeholders:
            for placeholder in sorted(all_placeholders):
                print(f"    - {placeholder}")
        else:
            print("    未发现占位符")

        # 7. 检查并更新数据库
        print_section("7. 检查并更新数据库")

        missing_templates = []
        for rel_path in local_templates.keys():
            if rel_path not in db_templates:
                missing_templates.append(rel_path)

        if missing_templates:
            print(f"  发现 {len(missing_templates)} 个缺失的模板记录")
            created_count = 0
            for rel_path in missing_templates:
                file_path = local_templates[rel_path]
                try:
                    template_id, status = update_or_create_template(conn, tenant_id, rel_path, file_path, db_templates)
                    if status == 'created':
                        print(f"  [创建] ID={template_id}, 路径={rel_path}")
                        created_count += 1
                except Exception as e:
                    print(f"  [错误] 创建失败: {rel_path}, 错误: {str(e)}")

            if created_count > 0:
                print_result(True, f"成功创建 {created_count} 条模板记录")
        else:
            print_result(True, "所有本地模板都已存在于数据库中")

        # 8. 检查文件格式变化（.doc -> .docx）
        print_section("8. 检查文件格式变化")

        # 检查数据库中是否有.doc路径，但本地已经是.docx
        format_changes = []
        for db_path, db_info in db_templates.items():
            if db_path.endswith('.doc'):
                # 检查是否有对应的.docx文件
                docx_path = db_path.replace('.doc', '.docx')
                if docx_path in local_templates:
                    format_changes.append((db_path, docx_path, db_info))

        if format_changes:
            print(f"  发现 {len(format_changes)} 个文件格式变化（.doc -> .docx）")
            updated_count = 0
            for old_path, new_path, db_info in format_changes:
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        UPDATE f_polic_file_config
                        SET file_path = %s
                        WHERE id = %s
                    """, (new_path, db_info['id']))
                    conn.commit()
                    cursor.close()
                    print(f"  [更新] ID={db_info['id']}, 名称={db_info['name']}")
                    print(f"    旧路径: {old_path}")
                    print(f"    新路径: {new_path}")
                    updated_count += 1
                except Exception as e:
                    print(f"  [错误] 更新失败: {str(e)}")

            if updated_count > 0:
                print_result(True, f"成功更新 {updated_count} 条路径记录")
        else:
            print_result(True, "未发现文件格式变化")

        # 9. 生成详细报告
        print_section("9. 详细报告")

        # 找出有占位符的模板示例
        templates_with_placeholders_list = []
        for rel_path, (placeholders, success, file_ext) in template_placeholders.items():
            if success and placeholders and file_ext == '.docx':
                templates_with_placeholders_list.append((rel_path, placeholders))

        if templates_with_placeholders_list:
            print(f"\n  有占位符的模板示例（前5个）:")
            for i, (rel_path, placeholders) in enumerate(templates_with_placeholders_list[:5], 1):
                print(f"\n  {i}. {Path(rel_path).name}")
                print(f"     路径: {rel_path}")
                print(f"     占位符数量: {len(placeholders)}")
                print(f"     占位符: {sorted(placeholders)}")

    finally:
        conn.close()
        print_result(True, "数据库连接已关闭")

    print_section("完成")


if __name__ == "__main__":
    main()