ai-business-write/cleanup_duplicate_templates.py

362 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
清理 f_polic_file_config 表中的重复和无效数据
确保文档模板结构和 template_finish/ 文件夹对应
"""
import os
import re
import json
import pymysql
from pathlib import Path
from typing import Dict, List, Set, Optional
from collections import defaultdict
# Database connection settings passed verbatim to pymysql.connect().
# SECURITY NOTE(review): these credentials were hard-coded in source control.
# Every value can now be overridden through an environment variable; the
# literals remain only as backward-compatible defaults. The password should be
# rotated and the default removed once deployments set FINYX_DB_PASSWORD.
DB_CONFIG = {
    'host': os.environ.get('FINYX_DB_HOST', '152.136.177.240'),
    'port': int(os.environ.get('FINYX_DB_PORT', '5012')),
    'user': os.environ.get('FINYX_DB_USER', 'finyx'),
    'password': os.environ.get('FINYX_DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.environ.get('FINYX_DB_NAME', 'finyx'),
    'charset': 'utf8mb4',
}
# tenant_id used in the WHERE clause of every query issued by this script.
TENANT_ID = 615873064429507639
# Value written to the updated_by column when a template is disabled.
UPDATED_BY = 655162080928945152
# Directory scanned for .docx template files (relative to the working dir).
TEMPLATE_BASE_DIR = 'template_finish'
# A real file extension: a dot followed by 1-5 ASCII alphanumerics starting
# with a letter (".docx", ".doc", ".pdf", ...). Heuristic on purpose, so that
# the dot in a numbered title such as "1.请示报告卡" is not taken for one.
_EXTENSION_RE = re.compile(r'\.[A-Za-z][A-Za-z0-9]{0,4}')
# Bracketed text, written with ASCII or full-width (U+FF08 / U+FF09) parens.
_BRACKETED_RE = re.compile(r'[(\uff08].*?[)\uff09]')
# Leading ordinal such as "1.", "2-" or "3 ".
_NUMERIC_PREFIX_RE = re.compile(r'^\d+[\.\-]?\s*')


def normalize_template_name(name: str) -> str:
    """
    Normalize a template or file name so equivalent names compare equal.

    Three things are removed, in this order: a trailing file extension,
    any bracketed text, and a leading numeric prefix.

    Args:
        name: File name or template name.
    Returns:
        The normalized name (may be empty).
    """
    # Strip the extension only when the suffix really looks like one.
    # Path(name).stem alone would reduce "1.请示报告卡" to "1", which the
    # prefix rule below then turns into an empty string.
    if '.' in name:
        path = Path(name)
        if _EXTENSION_RE.fullmatch(path.suffix):
            name = path.stem
    # Drop bracketed annotations (both ASCII and full-width parentheses).
    name = _BRACKETED_RE.sub('', name).strip()
    # Drop the numeric prefix and its separator.
    return _NUMERIC_PREFIX_RE.sub('', name).strip()
def scan_template_files(base_dir: str) -> Dict[str, List[Dict]]:
    """
    Recursively scan a directory for .docx template files.

    Args:
        base_dir: Directory to scan.
    Returns:
        Dict keyed by normalized template name. Each value is a list of
        file-info dicts (several files may share one normalized name) with
        the keys 'file_path', 'relative_path', 'file_name' and
        'normalized_name'. An empty dict is returned when base_dir does
        not exist.
    """
    base_path = Path(base_dir)
    if not base_path.exists():
        print(f"错误: 目录不存在 - {base_dir}")
        return {}

    templates = defaultdict(list)
    print("=" * 80)
    print("扫描模板文件...")
    print("=" * 80)

    # sorted() keeps the scan order (and therefore each list) deterministic.
    for docx_file in sorted(base_path.rglob("*.docx")):
        # Skip "~$" files, which the original comment calls temporary files.
        if docx_file.name.startswith("~$"):
            continue
        file_name = docx_file.name
        normalized_name = normalize_template_name(file_name)
        templates[normalized_name].append({
            'file_path': str(docx_file),
            'relative_path': str(docx_file.relative_to(base_path)),
            'file_name': file_name,
            'normalized_name': normalized_name
        })

    print(f"总共扫描到 {sum(len(v) for v in templates.values())} 个模板文件")
    print(f"唯一模板名称: {len(templates)}")
    # Return a plain dict so lookups of unknown names do not create entries.
    return dict(templates)
def get_all_templates_from_db(conn) -> Dict[str, List[Dict]]:
    """
    Load every f_polic_file_config row of the tenant, grouped by name.

    Args:
        conn: Open pymysql connection.
    Returns:
        Dict keyed by normalized template name; each value is the list of
        matching records. Rows are fetched ordered by created_time DESC, so
        each list keeps that order.
    """
    sql = """
    SELECT id, name, file_path, parent_id, state, input_data, created_time, updated_time
    FROM f_polic_file_config
    WHERE tenant_id = %s
    ORDER BY created_time DESC
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute(sql, (TENANT_ID,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    result = defaultdict(list)
    for row in rows:
        normalized_name = normalize_template_name(row['name'])
        result[normalized_name].append({
            'id': row['id'],
            'name': row['name'],
            'normalized_name': normalized_name,
            'file_path': row['file_path'],
            'parent_id': row['parent_id'],
            'state': row['state'],
            'input_data': row['input_data'],
            'created_time': row['created_time'],
            'updated_time': row['updated_time']
        })
    return dict(result)
def find_duplicates(db_templates: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """
    Keep only the groups that hold more than one template record.

    Args:
        db_templates: Records grouped by normalized template name.
    Returns:
        Dict with the same keys and list objects, restricted to names that
        have at least two records.
    """
    return {
        group_name: records
        for group_name, records in db_templates.items()
        if len(records) > 1
    }
def select_best_template(templates: List[Dict], valid_template_files: List[Dict]) -> Optional[Dict]:
    """
    Pick the record to keep from a group of duplicate templates.

    Records with state == 1 are preferred; among the candidates the one
    with the latest updated_time (falling back to created_time) wins. On a
    tie the record that comes first in `templates` is chosen.

    Args:
        templates: Database records sharing one normalized name. The list
            is not modified.
        valid_template_files: Matching files found on disk. Accepted for
            interface compatibility; the selection does not use it.
    Returns:
        The record to keep, or None when `templates` is empty.
    """
    if not templates:
        return None

    def recency(record: Dict):
        # Records with no timestamp sort before every dated record instead
        # of raising TypeError when None is compared with a datetime.
        stamp = record.get('updated_time') or record.get('created_time')
        return (1, stamp) if stamp is not None else (0,)

    # Fall back to the whole group when nothing is enabled.
    candidates = [t for t in templates if t.get('state') == 1] or templates
    # max() returns the first maximal element, matching the previous stable
    # descending sort, but leaves the caller's list untouched.
    return max(candidates, key=recency)
def delete_template_and_relations(conn, template_id: int) -> tuple:
    """
    Delete one template and its field relations in a single transaction.

    Rows are removed from f_polic_file_field first, then from
    f_polic_file_config, and the transaction is committed.

    Args:
        conn: Open pymysql connection.
        template_id: ID of the template to delete.
    Returns:
        Tuple (relations_deleted, template_deleted) of affected row counts.
    Raises:
        Exception: When either delete fails; the transaction is rolled back
            and the original error is attached as the cause.
    """
    delete_relations_sql = """
    DELETE FROM f_polic_file_field
    WHERE tenant_id = %s AND file_id = %s
    """
    delete_template_sql = """
    DELETE FROM f_polic_file_config
    WHERE tenant_id = %s AND id = %s
    """
    cursor = conn.cursor()
    try:
        # Remove the field relations before the template row itself.
        cursor.execute(delete_relations_sql, (TENANT_ID, template_id))
        relations_deleted = cursor.rowcount
        cursor.execute(delete_template_sql, (TENANT_ID, template_id))
        template_deleted = cursor.rowcount
        conn.commit()
        return relations_deleted, template_deleted
    except Exception as e:
        conn.rollback()
        # Chain explicitly so the driver error stays visible in tracebacks.
        raise Exception(f"删除模板失败: {str(e)}") from e
    finally:
        cursor.close()
def mark_invalid_templates(conn, valid_template_names: Set[str]):
    """
    Disable every template whose name has no matching template file.

    Each tenant row in f_polic_file_config whose normalized name is not in
    `valid_template_names` gets state = 0 plus a refreshed updated_time and
    updated_by. All updates are committed together.

    NOTE(review): rows are matched by name only. If the table also holds
    entries that are not backed by a file, they are disabled too; confirm
    this is intended.

    Args:
        conn: Open pymysql connection.
        valid_template_names: Normalized names of the templates to keep.
    Raises:
        Exception: When a query fails; the transaction is rolled back and
            the original error is attached as the cause.
    """
    select_sql = """
    SELECT id, name FROM f_polic_file_config
    WHERE tenant_id = %s
    """
    # Built once; the statement is identical for every row.
    update_sql = """
    UPDATE f_polic_file_config
    SET state = 0, updated_time = NOW(), updated_by = %s
    WHERE id = %s AND tenant_id = %s
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_sql, (TENANT_ID,))
        all_templates = cursor.fetchall()
        invalid_count = 0
        # Rows are read by position: column 0 is id, column 1 is name.
        for template in all_templates:
            template_id = template[0]
            template_name = template[1]
            if normalize_template_name(template_name) in valid_template_names:
                continue
            cursor.execute(update_sql, (UPDATED_BY, template_id, TENANT_ID))
            invalid_count += 1
            print(f" [WARN] 标记无效模板: {template_name} (ID: {template_id})")
        conn.commit()
        print(f"\n总共标记 {invalid_count} 个无效模板")
    except Exception as e:
        conn.rollback()
        # Chain explicitly so the driver error stays visible in tracebacks.
        raise Exception(f"标记无效模板失败: {str(e)}") from e
    finally:
        cursor.close()
def main():
    """
    Run the cleanup end to end.

    Steps: connect to the database, scan the template directory, load the
    template records, delete duplicate records (keeping one per normalized
    name), disable records without a matching file, then print a summary.

    The run stops before any write when the scan finds no template file.
    Otherwise an empty set of valid names would make the later step
    disable every template in the table.
    """
    print("=" * 80)
    print("清理重复和无效的模板数据")
    print("=" * 80)
    print()
    # Sentinel so the cleanup code can tell whether the connection was opened.
    conn = None
    try:
        # Connect to the database.
        print("1. 连接数据库...")
        conn = pymysql.connect(**DB_CONFIG)
        print("[OK] 数据库连接成功\n")

        # Scan the template files on disk.
        print("2. 扫描模板文件...")
        valid_templates = scan_template_files(TEMPLATE_BASE_DIR)
        valid_template_names = set(valid_templates.keys())
        if not valid_template_names:
            # A missing or empty directory must not wipe out the table.
            print(f"[ERROR] 未在 {TEMPLATE_BASE_DIR} 中找到任何模板文件, 已中止, 未修改数据库")
            return
        print(f"[OK] 找到 {len(valid_template_names)} 个有效模板名称\n")

        # Load the template records.
        print("3. 获取数据库中的模板...")
        db_templates = get_all_templates_from_db(conn)
        print(f"[OK] 数据库中有 {sum(len(v) for v in db_templates.values())} 个模板记录")
        print(f"[OK] 唯一模板名称: {len(db_templates)}\n")

        # Find names that have more than one record.
        print("4. 查找重复的模板...")
        duplicates = find_duplicates(db_templates)
        print(f"[OK] 找到 {len(duplicates)} 个重复的模板名称\n")

        # Keep one record per duplicated name and delete the rest.
        print("5. 处理重复模板...")
        print("=" * 80)
        total_deleted = 0
        total_relations_deleted = 0
        for normalized_name, templates in duplicates.items():
            print(f"\n处理重复模板: {normalized_name}")
            print(f" 重复记录数: {len(templates)}")
            valid_files = valid_templates.get(normalized_name, [])
            keep_template = select_best_template(templates, valid_files)
            if not keep_template:
                print(f" [WARN] 无法确定要保留的模板,跳过")
                continue
            print(f" [KEEP] 保留模板: {keep_template['name']} (ID: {keep_template['id']})")
            for template in templates:
                if template['id'] == keep_template['id']:
                    continue
                print(f" [DELETE] 删除重复模板: {template['name']} (ID: {template['id']})")
                relations_deleted, template_deleted = delete_template_and_relations(conn, template['id'])
                total_relations_deleted += relations_deleted
                total_deleted += template_deleted
        print(f"\n[OK] 删除重复模板: {total_deleted}")
        print(f"[OK] 删除关联关系: {total_relations_deleted}\n")

        # Disable records that have no matching file.
        print("6. 标记无效模板...")
        mark_invalid_templates(conn, valid_template_names)

        # Re-read the table and report the final state.
        print("\n7. 统计最终结果...")
        final_templates = get_all_templates_from_db(conn)
        final_records = [t for group in final_templates.values() for t in group]
        enabled_count = sum(1 for t in final_records if t.get('state') == 1)
        disabled_count = len(final_records) - enabled_count
        print(f"[OK] 最终模板总数: {len(final_records)}")
        print(f"[OK] 启用模板数: {enabled_count}")
        print(f"[OK] 禁用模板数: {disabled_count}")
        print(f"[OK] 唯一模板名称: {len(final_templates)}")

        # List the enabled templates, ordered by normalized name.
        print("\n8. 最终模板列表(启用的):")
        print("=" * 80)
        for normalized_name, templates in sorted(final_templates.items()):
            for template in templates:
                if template.get('state') == 1:
                    print(f" - {template['name']} (ID: {template['id']})")
        print("\n" + "=" * 80)
        print("清理完成!")
        print("=" * 80)
    except Exception as e:
        # Top-level boundary: report the failure and undo uncommitted work.
        print(f"\n[ERROR] 发生错误: {e}")
        import traceback
        traceback.print_exc()
        if conn is not None:
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()
            print("\n数据库连接已关闭")
# Run the cleanup only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()