""" 清理 f_polic_file_config 表中的重复和无效数据 确保文档模板结构和 template_finish/ 文件夹对应 """ import os import re import json import pymysql from pathlib import Path from typing import Dict, List, Set, Optional from collections import defaultdict # 数据库连接配置 DB_CONFIG = { 'host': '152.136.177.240', 'port': 5012, 'user': 'finyx', 'password': '6QsGK6MpePZDE57Z', 'database': 'finyx', 'charset': 'utf8mb4' } TENANT_ID = 615873064429507639 UPDATED_BY = 655162080928945152 TEMPLATE_BASE_DIR = 'template_finish' def normalize_template_name(name: str) -> str: """ 标准化模板名称(去掉扩展名、括号内容、数字前缀等) Args: name: 文件名或模板名称 Returns: 标准化后的名称 """ # 去掉扩展名 name = Path(name).stem if '.' in name else name # 去掉括号内容 name = re.sub(r'[((].*?[))]', '', name) name = name.strip() # 去掉数字前缀和点号 name = re.sub(r'^\d+[\.\-]?\s*', '', name) name = name.strip() return name def scan_template_files(base_dir: str) -> Dict[str, Dict]: """ 扫描模板文件夹,获取所有有效的模板文件 Returns: 字典,key为标准化名称,value为模板信息列表(可能有多个同名文件) """ base_path = Path(base_dir) if not base_path.exists(): print(f"错误: 目录不存在 - {base_dir}") return {} templates = defaultdict(list) print("=" * 80) print("扫描模板文件...") print("=" * 80) for docx_file in sorted(base_path.rglob("*.docx")): # 跳过临时文件 if docx_file.name.startswith("~$"): continue relative_path = docx_file.relative_to(base_path) file_name = docx_file.name normalized_name = normalize_template_name(file_name) templates[normalized_name].append({ 'file_path': str(docx_file), 'relative_path': str(relative_path), 'file_name': file_name, 'normalized_name': normalized_name }) print(f"总共扫描到 {sum(len(v) for v in templates.values())} 个模板文件") print(f"唯一模板名称: {len(templates)} 个") return dict(templates) def get_all_templates_from_db(conn) -> Dict[str, List[Dict]]: """ 从数据库获取所有模板,按标准化名称分组 Returns: 字典,key为标准化名称,value为模板记录列表 """ cursor = conn.cursor(pymysql.cursors.DictCursor) sql = """ SELECT id, name, file_path, parent_id, state, input_data, created_time, updated_time FROM f_polic_file_config WHERE tenant_id = %s ORDER BY created_time DESC """ cursor.execute(sql, (TENANT_ID,)) templates = cursor.fetchall() result = defaultdict(list) for template in templates: normalized_name = normalize_template_name(template['name']) result[normalized_name].append({ 'id': template['id'], 'name': template['name'], 'normalized_name': normalized_name, 'file_path': template['file_path'], 'parent_id': template['parent_id'], 'state': template['state'], 'input_data': template['input_data'], 'created_time': template['created_time'], 'updated_time': template['updated_time'] }) cursor.close() return dict(result) def find_duplicates(db_templates: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]: """ 找出重复的模板(同一标准化名称有多个记录) Returns: 字典,key为标准化名称,value为重复的模板记录列表 """ duplicates = {} for normalized_name, templates in db_templates.items(): if len(templates) > 1: duplicates[normalized_name] = templates return duplicates def select_best_template(templates: List[Dict], valid_template_files: List[Dict]) -> Optional[Dict]: """ 从多个重复的模板中选择最好的一个(保留最新的、有效的) Args: templates: 数据库中的模板记录列表 valid_template_files: 有效的模板文件列表 Returns: 应该保留的模板记录,或None """ if not templates: return None # 优先选择:state=1 且 file_path 有效的 enabled_templates = [t for t in templates if t.get('state') == 1] if enabled_templates: # 如果有多个启用的,选择最新的 enabled_templates.sort(key=lambda x: x.get('updated_time') or x.get('created_time'), reverse=True) return enabled_templates[0] # 如果没有启用的,选择最新的 templates.sort(key=lambda x: x.get('updated_time') or x.get('created_time'), reverse=True) return templates[0] def delete_template_and_relations(conn, template_id: int): """ 删除模板及其关联关系 Args: conn: 数据库连接 template_id: 模板ID """ cursor = conn.cursor() try: # 删除字段关联 delete_relations_sql = """ DELETE FROM f_polic_file_field WHERE tenant_id = %s AND file_id = %s """ cursor.execute(delete_relations_sql, (TENANT_ID, template_id)) relations_deleted = cursor.rowcount # 删除模板配置 delete_template_sql = """ DELETE FROM f_polic_file_config WHERE tenant_id = %s AND id = %s """ cursor.execute(delete_template_sql, (TENANT_ID, template_id)) template_deleted = cursor.rowcount conn.commit() return relations_deleted, template_deleted except Exception as e: conn.rollback() raise Exception(f"删除模板失败: {str(e)}") finally: cursor.close() def mark_invalid_templates(conn, valid_template_names: Set[str]): """ 标记无效的模板(不在template_finish文件夹中的模板) Args: conn: 数据库连接 valid_template_names: 有效的模板名称集合(标准化后的) """ cursor = conn.cursor() try: # 获取所有模板 sql = """ SELECT id, name FROM f_polic_file_config WHERE tenant_id = %s """ cursor.execute(sql, (TENANT_ID,)) all_templates = cursor.fetchall() invalid_count = 0 for template in all_templates: template_id = template[0] template_name = template[1] normalized_name = normalize_template_name(template_name) # 检查是否在有效模板列表中 if normalized_name not in valid_template_names: # 标记为未启用 update_sql = """ UPDATE f_polic_file_config SET state = 0, updated_time = NOW(), updated_by = %s WHERE id = %s AND tenant_id = %s """ cursor.execute(update_sql, (UPDATED_BY, template_id, TENANT_ID)) invalid_count += 1 print(f" [WARN] 标记无效模板: {template_name} (ID: {template_id})") conn.commit() print(f"\n总共标记 {invalid_count} 个无效模板") except Exception as e: conn.rollback() raise Exception(f"标记无效模板失败: {str(e)}") finally: cursor.close() def main(): """主函数""" print("=" * 80) print("清理重复和无效的模板数据") print("=" * 80) print() try: # 连接数据库 print("1. 连接数据库...") conn = pymysql.connect(**DB_CONFIG) print("[OK] 数据库连接成功\n") # 扫描模板文件 print("2. 扫描模板文件...") valid_templates = scan_template_files(TEMPLATE_BASE_DIR) valid_template_names = set(valid_templates.keys()) print(f"[OK] 找到 {len(valid_template_names)} 个有效模板名称\n") # 获取数据库中的模板 print("3. 获取数据库中的模板...") db_templates = get_all_templates_from_db(conn) print(f"[OK] 数据库中有 {sum(len(v) for v in db_templates.values())} 个模板记录") print(f"[OK] 唯一模板名称: {len(db_templates)} 个\n") # 找出重复的模板 print("4. 查找重复的模板...") duplicates = find_duplicates(db_templates) print(f"[OK] 找到 {len(duplicates)} 个重复的模板名称\n") # 处理重复模板 print("5. 处理重复模板...") print("=" * 80) total_deleted = 0 total_relations_deleted = 0 for normalized_name, templates in duplicates.items(): print(f"\n处理重复模板: {normalized_name}") print(f" 重复记录数: {len(templates)}") # 获取对应的有效模板文件 valid_files = valid_templates.get(normalized_name, []) # 选择要保留的模板 keep_template = select_best_template(templates, valid_files) if keep_template: print(f" [KEEP] 保留模板: {keep_template['name']} (ID: {keep_template['id']})") # 删除其他重复的模板 for template in templates: if template['id'] != keep_template['id']: print(f" [DELETE] 删除重复模板: {template['name']} (ID: {template['id']})") relations_deleted, template_deleted = delete_template_and_relations(conn, template['id']) total_relations_deleted += relations_deleted total_deleted += template_deleted else: print(f" [WARN] 无法确定要保留的模板,跳过") print(f"\n[OK] 删除重复模板: {total_deleted} 个") print(f"[OK] 删除关联关系: {total_relations_deleted} 条\n") # 标记无效模板 print("6. 标记无效模板...") mark_invalid_templates(conn, valid_template_names) # 统计最终结果 print("\n7. 统计最终结果...") final_templates = get_all_templates_from_db(conn) enabled_count = sum(1 for templates in final_templates.values() for t in templates if t.get('state') == 1) disabled_count = sum(1 for templates in final_templates.values() for t in templates if t.get('state') != 1) print(f"[OK] 最终模板总数: {sum(len(v) for v in final_templates.values())}") print(f"[OK] 启用模板数: {enabled_count}") print(f"[OK] 禁用模板数: {disabled_count}") print(f"[OK] 唯一模板名称: {len(final_templates)}") # 打印最终模板列表 print("\n8. 最终模板列表(启用的):") print("=" * 80) for normalized_name, templates in sorted(final_templates.items()): enabled = [t for t in templates if t.get('state') == 1] if enabled: for template in enabled: print(f" - {template['name']} (ID: {template['id']})") print("\n" + "=" * 80) print("清理完成!") print("=" * 80) except Exception as e: print(f"\n[ERROR] 发生错误: {e}") import traceback traceback.print_exc() if 'conn' in locals(): conn.rollback() finally: if 'conn' in locals(): conn.close() print("\n数据库连接已关闭") if __name__ == '__main__': main()