"""
清理 f_polic_file_config 表中的重复和无效数据
确保文档模板结构和 template_finish/ 文件夹对应
"""
import os
import re
import json
import pymysql
from pathlib import Path
from typing import Dict, List, Set, Optional
from collections import defaultdict

# Database connection settings passed verbatim to pymysql.connect().
# Each value can be overridden through a FINYX_DB_* environment variable.
# NOTE(review): the fallback values are live credentials committed to source
# control -- prefer supplying them via the environment and rotating the
# password; the literals are kept only so existing invocations keep working.
DB_CONFIG = {
    'host': os.environ.get('FINYX_DB_HOST', '152.136.177.240'),
    'port': int(os.environ.get('FINYX_DB_PORT', '5012')),
    'user': os.environ.get('FINYX_DB_USER', 'finyx'),
    'password': os.environ.get('FINYX_DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.environ.get('FINYX_DB_NAME', 'finyx'),
    'charset': 'utf8mb4'
}

# Tenant whose rows are read, updated and deleted (used in every WHERE clause).
TENANT_ID = 615873064429507639
# Value written to the updated_by column when a template is disabled.
UPDATED_BY = 655162080928945152
# Directory that is scanned recursively for *.docx template files.
TEMPLATE_BASE_DIR = 'template_finish'


def normalize_template_name(name: str) -> str:
    """
    Normalize a file name or template name so the two can be compared.

    The following are removed, in order: a trailing file extension,
    any bracketed text (ASCII or full-width parentheses), and a leading
    numeric prefix such as ``1.`` or ``2-``.

    Args:
        name: File name (e.g. ``"1.foo(bar).docx"``) or template name
            (e.g. ``"1.foo"``).

    Returns:
        The normalized name.
    """
    name = name.strip()

    # Only drop the last dotted component when it looks like a real file
    # extension (short, ASCII, starting with a letter). Blindly taking
    # Path(name).stem would turn an extension-less name such as "1.foo"
    # into "1" -- and then into "" once the numeric prefix is stripped --
    # which makes unrelated numbered templates look like duplicates.
    candidate = Path(name)
    if re.fullmatch(r'\.[A-Za-z][A-Za-z0-9]{0,4}', candidate.suffix):
        name = candidate.stem

    # Drop bracketed content (non-greedy, both bracket styles).
    name = re.sub(r'[（(].*?[）)]', '', name).strip()

    # Drop a leading number with an optional "." or "-" separator.
    name = re.sub(r'^\d+[\.\-]?\s*', '', name).strip()

    return name


def scan_template_files(base_dir: str) -> Dict[str, Dict]:
    """
    Recursively collect the .docx template files found under ``base_dir``.

    Files whose name starts with ``~$`` (Office lock/temp files) are ignored.

    Returns:
        Mapping from normalized template name to a list of file-info dicts
        (several files may share one normalized name). An empty dict is
        returned when ``base_dir`` does not exist.
    """
    root = Path(base_dir)
    if not root.exists():
        print(f"错误: 目录不存在 - {base_dir}")
        return {}

    grouped = defaultdict(list)
    banner = "=" * 80

    print(banner)
    print("扫描模板文件...")
    print(banner)

    # Sorted traversal keeps the per-name lists in a deterministic order.
    candidates = (
        path for path in sorted(root.rglob("*.docx"))
        if not path.name.startswith("~$")
    )
    for path in candidates:
        key = normalize_template_name(path.name)
        entry = {
            'file_path': str(path),
            'relative_path': str(path.relative_to(root)),
            'file_name': path.name,
            'normalized_name': key
        }
        grouped[key].append(entry)

    file_total = sum(len(entries) for entries in grouped.values())
    print(f"总共扫描到 {file_total} 个模板文件")
    print(f"唯一模板名称: {len(grouped)} 个")

    return dict(grouped)


def get_all_templates_from_db(conn) -> Dict[str, List[Dict]]:
    """
    Load every template row of the tenant and group the rows by normalized name.

    Args:
        conn: Open database connection.

    Returns:
        Mapping from normalized template name to the list of matching
        records, in the order returned by the query (created_time DESC).
    """
    sql = """
        SELECT id, name, file_path, parent_id, state, input_data, created_time, updated_time
        FROM f_polic_file_config
        WHERE tenant_id = %s
        ORDER BY created_time DESC
    """
    grouped = defaultdict(list)

    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute(sql, (TENANT_ID,))
        for row in cursor.fetchall():
            normalized_name = normalize_template_name(row['name'])
            grouped[normalized_name].append({
                'id': row['id'],
                'name': row['name'],
                'normalized_name': normalized_name,
                'file_path': row['file_path'],
                'parent_id': row['parent_id'],
                'state': row['state'],
                'input_data': row['input_data'],
                'created_time': row['created_time'],
                'updated_time': row['updated_time']
            })
    finally:
        # Release the cursor even when the query or the grouping fails,
        # matching the other database helpers in this file.
        cursor.close()

    return dict(grouped)


def find_duplicates(db_templates: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """
    Keep only the groups that contain more than one template record.

    Args:
        db_templates: Mapping from normalized name to template records.

    Returns:
        Mapping from normalized name to its records, restricted to names
        that have at least two records.
    """
    return {
        name: records
        for name, records in db_templates.items()
        if len(records) > 1
    }


def select_best_template(templates: List[Dict], valid_template_files: List[Dict]) -> Optional[Dict]:
    """
    Pick the record to keep from a group of duplicate templates.

    Records with ``state == 1`` are preferred; among the candidates the
    most recent one wins, judged by ``updated_time`` and falling back to
    ``created_time``. On a tie the earliest record in ``templates`` wins.

    Args:
        templates: Template records from the database. The list is not
            modified.
        valid_template_files: Template files found on disk for this name.
            Accepted for interface compatibility; the current selection
            logic does not consult it.

    Returns:
        The record that should be kept, or None when ``templates`` is empty.
    """
    if not templates:
        return None

    def recency(record: Dict):
        stamp = record.get('updated_time') or record.get('created_time')
        # Records with no timestamp at all sort before every dated record
        # instead of raising TypeError on a None-vs-datetime comparison.
        return (stamp is not None, stamp)

    enabled_templates = [t for t in templates if t.get('state') == 1]

    # max() leaves the caller's list untouched and, like a stable
    # descending sort, returns the first of several equally recent records.
    return max(enabled_templates or templates, key=recency)


def delete_template_and_relations(conn, template_id: int):
    """
    Delete one template together with its field relations.

    Both deletes run in the same transaction: it is committed when both
    succeed and rolled back when either fails.

    Args:
        conn: Open database connection.
        template_id: ID of the template to delete.

    Returns:
        Tuple ``(relations_deleted, template_deleted)`` holding the row
        counts reported for the two DELETE statements.

    Raises:
        Exception: If either statement or the commit fails; the original
            error is attached as ``__cause__``.
    """
    cursor = conn.cursor()

    try:
        # Remove the field relations first, then the template row itself.
        delete_relations_sql = """
            DELETE FROM f_polic_file_field
            WHERE tenant_id = %s AND file_id = %s
        """
        cursor.execute(delete_relations_sql, (TENANT_ID, template_id))
        relations_deleted = cursor.rowcount

        delete_template_sql = """
            DELETE FROM f_polic_file_config
            WHERE tenant_id = %s AND id = %s
        """
        cursor.execute(delete_template_sql, (TENANT_ID, template_id))
        template_deleted = cursor.rowcount

        conn.commit()
        return relations_deleted, template_deleted

    except Exception as e:
        conn.rollback()
        # Chain explicitly so the underlying database error stays visible.
        raise Exception(f"删除模板失败: {str(e)}") from e
    finally:
        cursor.close()


def mark_invalid_templates(conn, valid_template_names: Set[str]):
    """
    Disable templates whose normalized name is not among the valid names.

    Every matching row gets ``state = 0`` and refreshed ``updated_time`` /
    ``updated_by`` columns. All updates are committed together, or rolled
    back together on failure.

    Args:
        conn: Open database connection.
        valid_template_names: Normalized names of the templates that exist
            on disk. When empty, nothing is changed.

    Raises:
        Exception: If a query or the commit fails; the original error is
            attached as ``__cause__``.
    """
    # An empty set (e.g. the template folder was missing) would otherwise
    # disable every template of the tenant, so refuse to act on it.
    if not valid_template_names:
        print("  [WARN] 有效模板名称集合为空，跳过标记以避免禁用所有模板")
        return

    select_sql = """
            SELECT id, name FROM f_polic_file_config
            WHERE tenant_id = %s
        """
    update_sql = """
                    UPDATE f_polic_file_config
                    SET state = 0, updated_time = NOW(), updated_by = %s
                    WHERE id = %s AND tenant_id = %s
                """

    cursor = conn.cursor()

    try:
        cursor.execute(select_sql, (TENANT_ID,))
        all_templates = cursor.fetchall()

        invalid_count = 0
        for template in all_templates:
            # Rows come from a default (tuple) cursor: (id, name).
            template_id = template[0]
            template_name = template[1]

            if normalize_template_name(template_name) in valid_template_names:
                continue

            cursor.execute(update_sql, (UPDATED_BY, template_id, TENANT_ID))
            invalid_count += 1
            print(f"  [WARN] 标记无效模板: {template_name} (ID: {template_id})")

        conn.commit()
        print(f"\n总共标记 {invalid_count} 个无效模板")

    except Exception as e:
        conn.rollback()
        # Chain explicitly so the underlying database error stays visible.
        raise Exception(f"标记无效模板失败: {str(e)}") from e
    finally:
        cursor.close()


def main():
    """
    Run the cleanup end to end.

    Steps: connect to the database, scan the template folder, load the
    template rows, delete duplicate rows (keeping the best one per
    normalized name), disable rows that have no matching file, and print
    a summary. Any error is printed with a traceback and the connection is
    always closed.
    """
    print("=" * 80)
    print("清理重复和无效的模板数据")
    print("=" * 80)
    print()

    # Sentinel so the cleanup code can tell whether connecting succeeded.
    conn = None

    try:
        # Connect to the database.
        print("1. 连接数据库...")
        conn = pymysql.connect(**DB_CONFIG)
        print("[OK] 数据库连接成功\n")

        # Scan the template files on disk.
        print("2. 扫描模板文件...")
        valid_templates = scan_template_files(TEMPLATE_BASE_DIR)
        valid_template_names = set(valid_templates.keys())
        print(f"[OK] 找到 {len(valid_template_names)} 个有效模板名称\n")

        # With no template files (missing or empty folder) every row would
        # be judged invalid and disabled, so stop before changing anything.
        if not valid_template_names:
            print("[ERROR] 未找到任何有效模板文件，为避免误将所有模板标记为无效，已中止清理")
            return

        # Load the template rows.
        print("3. 获取数据库中的模板...")
        db_templates = get_all_templates_from_db(conn)
        print(f"[OK] 数据库中有 {sum(len(v) for v in db_templates.values())} 个模板记录")
        print(f"[OK] 唯一模板名称: {len(db_templates)} 个\n")

        # Find names that have more than one row.
        print("4. 查找重复的模板...")
        duplicates = find_duplicates(db_templates)
        print(f"[OK] 找到 {len(duplicates)} 个重复的模板名称\n")

        # Resolve each duplicate group.
        print("5. 处理重复模板...")
        print("=" * 80)

        total_deleted = 0
        total_relations_deleted = 0

        for normalized_name, templates in duplicates.items():
            print(f"\n处理重复模板: {normalized_name}")
            print(f"  重复记录数: {len(templates)}")

            # Template files on disk that share this normalized name.
            valid_files = valid_templates.get(normalized_name, [])

            # Decide which row survives.
            keep_template = select_best_template(templates, valid_files)

            if not keep_template:
                print(f"  [WARN] 无法确定要保留的模板，跳过")
                continue

            print(f"  [KEEP] 保留模板: {keep_template['name']} (ID: {keep_template['id']})")

            # Delete every other row of the group.
            for template in templates:
                if template['id'] == keep_template['id']:
                    continue
                print(f"  [DELETE] 删除重复模板: {template['name']} (ID: {template['id']})")
                relations_deleted, template_deleted = delete_template_and_relations(conn, template['id'])
                total_relations_deleted += relations_deleted
                total_deleted += template_deleted

        print(f"\n[OK] 删除重复模板: {total_deleted} 个")
        print(f"[OK] 删除关联关系: {total_relations_deleted} 条\n")

        # Disable rows that have no matching template file.
        print("6. 标记无效模板...")
        mark_invalid_templates(conn, valid_template_names)

        # Summarize the final state.
        print("\n7. 统计最终结果...")
        final_templates = get_all_templates_from_db(conn)
        all_records = [t for templates in final_templates.values() for t in templates]
        enabled_count = sum(1 for t in all_records if t.get('state') == 1)
        disabled_count = len(all_records) - enabled_count

        print(f"[OK] 最终模板总数: {len(all_records)}")
        print(f"[OK] 启用模板数: {enabled_count}")
        print(f"[OK] 禁用模板数: {disabled_count}")
        print(f"[OK] 唯一模板名称: {len(final_templates)}")

        # List the templates that are still enabled.
        print("\n8. 最终模板列表（启用的）:")
        print("=" * 80)
        for normalized_name, templates in sorted(final_templates.items()):
            for template in templates:
                if template.get('state') == 1:
                    print(f"  - {template['name']} (ID: {template['id']})")

        print("\n" + "=" * 80)
        print("清理完成！")
        print("=" * 80)

    except Exception as e:
        print(f"\n[ERROR] 发生错误: {e}")
        import traceback
        traceback.print_exc()
        if conn is not None:
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()
            print("\n数据库连接已关闭")


# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()