"""
Clean up duplicate and invalid rows in the f_polic_file_config table.

Ensures that the document-template structure stored in the database
corresponds to the contents of the template_finish/ folder.
"""
|
||
import os
|
||
import re
|
||
import json
|
||
import pymysql
|
||
from pathlib import Path
|
||
from typing import Dict, List, Set, Optional
|
||
from collections import defaultdict
|
||
|
||
# Database connection settings.
# Each value can be overridden through an environment variable; the literal
# fallbacks keep the script working exactly as before when none are set.
# NOTE(review): the fallback password is committed to source control. Rotate
# the credential and supply it via FINYX_DB_PASSWORD, then drop the literal.
DB_CONFIG = {
    'host': os.environ.get('FINYX_DB_HOST', '152.136.177.240'),
    'port': int(os.environ.get('FINYX_DB_PORT', '5012')),
    'user': os.environ.get('FINYX_DB_USER', 'finyx'),
    'password': os.environ.get('FINYX_DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.environ.get('FINYX_DB_NAME', 'finyx'),
    'charset': 'utf8mb4',
}

# Tenant whose rows are read and modified by every query in this script.
TENANT_ID = 615873064429507639
# Value written to the updated_by column when a template row is updated.
UPDATED_BY = 655162080928945152
# Folder that is scanned for the set of valid .docx templates.
TEMPLATE_BASE_DIR = 'template_finish'
|
||
|
||
|
||
# File extensions that are stripped during normalization (compared
# case-insensitively). Anything else after a dot is treated as part of the name.
_TEMPLATE_FILE_SUFFIXES = frozenset({
    '.docx', '.doc', '.docm', '.dotx', '.dotm', '.dot',
    '.wps', '.rtf', '.pdf', '.txt', '.xlsx', '.xls',
})


def normalize_template_name(name: str) -> str:
    """
    Normalize a file name or template name so that variants compare equal.

    The following are removed, in order:
      1. a recognised document extension (for example ``.docx``);
      2. every parenthesised segment, with ASCII or full-width parentheses;
      3. a leading numeric prefix such as ``1.``, ``2-`` or ``3 ``.

    Args:
        name: File name or template name.

    Returns:
        The normalized name with surrounding whitespace stripped.
    """
    # Strip the suffix only when it is a real document extension. Taking
    # Path(name).stem unconditionally turns an extension-less "1.Report" into
    # "1", which then normalizes to "" and makes every numbered template look
    # like a duplicate of every other one.
    candidate = Path(name)
    if candidate.suffix.lower() in _TEMPLATE_FILE_SUFFIXES:
        name = candidate.stem

    # Remove parenthesised content. \uFF08 / \uFF09 are the full-width
    # parentheses commonly used in Chinese file names.
    name = re.sub(r'[(\uFF08].*?[)\uFF09]', '', name)
    name = name.strip()

    # Remove a leading number together with an optional "." or "-" separator.
    name = re.sub(r'^\d+[\.\-]?\s*', '', name)
    name = name.strip()

    return name
|
||
|
||
|
||
def scan_template_files(base_dir: str) -> Dict[str, List[Dict]]:
    """
    Recursively scan a folder for .docx template files.

    Files whose name starts with "~$" are skipped. Every remaining file is
    grouped under its normalized name (see normalize_template_name).

    Args:
        base_dir: Folder to scan.

    Returns:
        Dict mapping a normalized name to a list of file-info dicts with the
        keys 'file_path', 'relative_path', 'file_name' and 'normalized_name'.
        Several files may share one normalized name. An empty dict is
        returned when base_dir is missing or is not a directory.
    """
    base_path = Path(base_dir)
    # is_dir() is False both for a missing path and for a path that is not a
    # directory; neither can be scanned.
    if not base_path.is_dir():
        print(f"错误: 目录不存在 - {base_dir}")
        return {}

    templates = defaultdict(list)

    print("=" * 80)
    print("扫描模板文件...")
    print("=" * 80)

    # sorted() keeps the per-name lists in a deterministic order.
    for docx_file in sorted(base_path.rglob("*.docx")):
        # Skip temporary files.
        if docx_file.name.startswith("~$"):
            continue

        file_name = docx_file.name
        normalized_name = normalize_template_name(file_name)

        templates[normalized_name].append({
            'file_path': str(docx_file),
            'relative_path': str(docx_file.relative_to(base_path)),
            'file_name': file_name,
            'normalized_name': normalized_name,
        })

    print(f"总共扫描到 {sum(len(v) for v in templates.values())} 个模板文件")
    print(f"唯一模板名称: {len(templates)} 个")

    return dict(templates)
|
||
|
||
|
||
def get_all_templates_from_db(conn) -> Dict[str, List[Dict]]:
    """
    Load every f_polic_file_config row of TENANT_ID, grouped by normalized name.

    Args:
        conn: Open pymysql connection.

    Returns:
        Dict mapping a normalized template name to the list of matching
        records. Rows are fetched ordered by created_time descending, so each
        list keeps that order.
    """
    sql = """
        SELECT id, name, file_path, parent_id, state, input_data, created_time, updated_time
        FROM f_polic_file_config
        WHERE tenant_id = %s
        ORDER BY created_time DESC
    """

    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute(sql, (TENANT_ID,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    result = defaultdict(list)
    for row in rows:
        normalized_name = normalize_template_name(row['name'])
        result[normalized_name].append({
            'id': row['id'],
            'name': row['name'],
            'normalized_name': normalized_name,
            'file_path': row['file_path'],
            'parent_id': row['parent_id'],
            'state': row['state'],
            'input_data': row['input_data'],
            'created_time': row['created_time'],
            'updated_time': row['updated_time'],
        })

    return dict(result)
|
||
|
||
|
||
def find_duplicates(db_templates: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """
    Pick out the normalized names that have more than one database record.

    Args:
        db_templates: Records grouped by normalized name.

    Returns:
        Dict containing only the groups with two or more records; the record
        lists are the same list objects as in db_templates.
    """
    return {
        group_name: records
        for group_name, records in db_templates.items()
        if len(records) > 1
    }
|
||
|
||
|
||
def select_best_template(templates: List[Dict], valid_template_files: List[Dict]) -> Optional[Dict]:
    """
    Choose which of several duplicate template records should be kept.

    Enabled records (state == 1) are preferred; among the candidates the most
    recent one wins, judged by updated_time and falling back to created_time.
    When several candidates are equally recent, the one that appears first in
    `templates` is returned. The input list is not modified.

    Args:
        templates: Database records that share one normalized name.
        valid_template_files: Matching template files on disk. Currently not
            consulted by the selection; accepted so callers need not change.

    Returns:
        The record to keep, or None when `templates` is empty.
    """
    if not templates:
        return None

    def recency(record: Dict):
        # Sort key (has_timestamp, timestamp). Records with neither timestamp
        # rank as oldest instead of raising TypeError on a None comparison.
        stamp = record.get('updated_time') or record.get('created_time')
        if stamp is None:
            return (False, 0)
        return (True, stamp)

    enabled_templates = [t for t in templates if t.get('state') == 1]

    # max() returns the first of several equally recent records, which is what
    # a stable descending sort would place first, without reordering the
    # caller's list.
    return max(enabled_templates or templates, key=recency)
|
||
|
||
|
||
def delete_template_and_relations(conn, template_id: int):
    """
    Delete one template row together with its field relations.

    Rows are removed from f_polic_file_field first and then from
    f_polic_file_config, both restricted to TENANT_ID. The two deletes are
    committed together; on any error the transaction is rolled back.

    Args:
        conn: Open database connection.
        template_id: ID of the template to delete.

    Returns:
        Tuple (relations_deleted, template_deleted) holding the row counts
        reported by the two DELETE statements.

    Raises:
        Exception: When a statement or the commit fails; the original error
            is attached as the cause.
    """
    delete_relations_sql = """
        DELETE FROM f_polic_file_field
        WHERE tenant_id = %s AND file_id = %s
    """
    delete_template_sql = """
        DELETE FROM f_polic_file_config
        WHERE tenant_id = %s AND id = %s
    """

    cursor = conn.cursor()

    try:
        # Remove the field relations.
        cursor.execute(delete_relations_sql, (TENANT_ID, template_id))
        relations_deleted = cursor.rowcount

        # Remove the template configuration itself.
        cursor.execute(delete_template_sql, (TENANT_ID, template_id))
        template_deleted = cursor.rowcount

        conn.commit()
        return relations_deleted, template_deleted

    except Exception as e:
        conn.rollback()
        # Chain explicitly so the original database error stays visible.
        raise Exception(f"删除模板失败: {str(e)}") from e
    finally:
        cursor.close()
|
||
|
||
|
||
def mark_invalid_templates(conn, valid_template_names: Set[str]):
    """
    Disable templates whose normalized name is not among the valid names.

    Every f_polic_file_config row of TENANT_ID whose normalized name is
    missing from `valid_template_names` gets state = 0, with updated_time and
    updated_by refreshed. All updates are committed together; on any error
    the transaction is rolled back.

    Args:
        conn: Open database connection.
        valid_template_names: Normalized names of the valid templates.

    Raises:
        Exception: When a statement or the commit fails; the original error
            is attached as the cause.
    """
    # An empty set would disable every template of the tenant. That is far
    # more likely to be a failed scan than an intended request, so do nothing.
    if not valid_template_names:
        print(" [WARN] 有效模板名称集合为空, 跳过标记以避免禁用全部模板")
        return

    select_sql = """
        SELECT id, name FROM f_polic_file_config
        WHERE tenant_id = %s
    """
    # Loop-invariant, so it is built once rather than per row.
    update_sql = """
        UPDATE f_polic_file_config
        SET state = 0, updated_time = NOW(), updated_by = %s
        WHERE id = %s AND tenant_id = %s
    """

    cursor = conn.cursor()

    try:
        # Fetch all templates of the tenant.
        cursor.execute(select_sql, (TENANT_ID,))
        all_templates = cursor.fetchall()

        invalid_count = 0
        for row in all_templates:
            # Rows are indexed by position: column 0 is id, column 1 is name.
            template_id = row[0]
            template_name = row[1]

            if normalize_template_name(template_name) in valid_template_names:
                continue

            # Mark as not enabled.
            cursor.execute(update_sql, (UPDATED_BY, template_id, TENANT_ID))
            invalid_count += 1
            print(f" [WARN] 标记无效模板: {template_name} (ID: {template_id})")

        conn.commit()
        print(f"\n总共标记 {invalid_count} 个无效模板")

    except Exception as e:
        conn.rollback()
        # Chain explicitly so the original database error stays visible.
        raise Exception(f"标记无效模板失败: {str(e)}") from e
    finally:
        cursor.close()
|
||
|
||
|
||
def _remove_duplicate_templates(conn, duplicates, valid_templates):
    """Delete all but the selected record of each duplicate group; return (templates_deleted, relations_deleted)."""
    total_deleted = 0
    total_relations_deleted = 0

    for normalized_name, templates in duplicates.items():
        print(f"\n处理重复模板: {normalized_name}")
        print(f" 重复记录数: {len(templates)}")

        # Template files on disk that correspond to this name.
        valid_files = valid_templates.get(normalized_name, [])

        # Choose the record to keep.
        keep_template = select_best_template(templates, valid_files)
        if not keep_template:
            print(f" [WARN] 无法确定要保留的模板,跳过")
            continue

        print(f" [KEEP] 保留模板: {keep_template['name']} (ID: {keep_template['id']})")

        # Delete every other record of the group.
        for template in templates:
            if template['id'] == keep_template['id']:
                continue
            print(f" [DELETE] 删除重复模板: {template['name']} (ID: {template['id']})")
            relations_deleted, template_deleted = delete_template_and_relations(conn, template['id'])
            total_relations_deleted += relations_deleted
            total_deleted += template_deleted

    return total_deleted, total_relations_deleted


def _print_final_summary(conn):
    """Re-read the templates from the database and print the totals and the enabled ones."""
    final_templates = get_all_templates_from_db(conn)
    all_records = [t for templates in final_templates.values() for t in templates]
    enabled_count = sum(1 for t in all_records if t.get('state') == 1)
    disabled_count = len(all_records) - enabled_count

    print(f"[OK] 最终模板总数: {len(all_records)}")
    print(f"[OK] 启用模板数: {enabled_count}")
    print(f"[OK] 禁用模板数: {disabled_count}")
    print(f"[OK] 唯一模板名称: {len(final_templates)}")

    # List the enabled templates, ordered by normalized name.
    print("\n8. 最终模板列表(启用的):")
    print("=" * 80)
    for normalized_name, templates in sorted(final_templates.items()):
        for template in templates:
            if template.get('state') == 1:
                print(f" - {template['name']} (ID: {template['id']})")


def main():
    """
    Entry point: remove duplicate templates and disable invalid ones.

    Steps: connect to the database, scan TEMPLATE_BASE_DIR, load the database
    templates, delete duplicates (keeping one record per normalized name),
    disable templates that have no matching file, then print a summary.
    The run stops before any modification when the scan finds no template
    files. Errors are printed with a traceback and the connection is always
    closed.
    """
    print("=" * 80)
    print("清理重复和无效的模板数据")
    print("=" * 80)
    print()

    # Initialised up front so the handlers below can test it directly.
    conn = None

    try:
        # Connect to the database.
        print("1. 连接数据库...")
        conn = pymysql.connect(**DB_CONFIG)
        print("[OK] 数据库连接成功\n")

        # Scan the template files.
        print("2. 扫描模板文件...")
        valid_templates = scan_template_files(TEMPLATE_BASE_DIR)
        valid_template_names = set(valid_templates.keys())
        if not valid_template_names:
            # Without any valid file, the later steps would disable every
            # template of the tenant; a missing or empty folder is far more
            # likely a setup mistake than the intended state.
            print("[ERROR] 未找到任何有效模板文件, 已中止, 未修改数据库")
            return
        print(f"[OK] 找到 {len(valid_template_names)} 个有效模板名称\n")

        # Load the templates stored in the database.
        print("3. 获取数据库中的模板...")
        db_templates = get_all_templates_from_db(conn)
        print(f"[OK] 数据库中有 {sum(len(v) for v in db_templates.values())} 个模板记录")
        print(f"[OK] 唯一模板名称: {len(db_templates)} 个\n")

        # Find duplicated templates.
        print("4. 查找重复的模板...")
        duplicates = find_duplicates(db_templates)
        print(f"[OK] 找到 {len(duplicates)} 个重复的模板名称\n")

        # Resolve the duplicates.
        print("5. 处理重复模板...")
        print("=" * 80)
        total_deleted, total_relations_deleted = _remove_duplicate_templates(
            conn, duplicates, valid_templates
        )
        print(f"\n[OK] 删除重复模板: {total_deleted} 个")
        print(f"[OK] 删除关联关系: {total_relations_deleted} 条\n")

        # Disable templates that have no matching file.
        print("6. 标记无效模板...")
        mark_invalid_templates(conn, valid_template_names)

        # Report the final state.
        print("\n7. 统计最终结果...")
        _print_final_summary(conn)

        print("\n" + "=" * 80)
        print("清理完成!")
        print("=" * 80)

    except Exception as e:
        print(f"\n[ERROR] 发生错误: {e}")
        import traceback
        traceback.print_exc()
        if conn is not None:
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()
            print("\n数据库连接已关闭")
|
||
|
||
|
||
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||
|