"""
Rescan template placeholders and synchronise the database.

1. Scan every local template file (including newly converted .docx files).
2. Extract all placeholders.
3. Check the template records stored in the database.
4. Update the database when something changed.
"""
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Set, Tuple

import pymysql
from docx import Document
from dotenv import load_dotenv

# Load environment variables from a local .env file, if any.
load_dotenv()

# Database configuration.
# NOTE(review): the fallback values below embed a real-looking host, user and
# password in source control. They are kept so existing runs behave the same,
# but they should be rotated and supplied only through the environment.
DB_CONFIG = {
    'host': os.getenv('DB_HOST', '152.136.177.240'),
    'port': int(os.getenv('DB_PORT', 5012)),
    'user': os.getenv('DB_USER', 'finyx'),
    'password': os.getenv('DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.getenv('DB_NAME', 'finyx'),
    'charset': 'utf8mb4',
}

# User id written to created_by / updated_by on inserted rows.
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152

# Project root and the directory that holds the template files.
PROJECT_ROOT = Path(__file__).parent
TEMPLATES_DIR = PROJECT_ROOT / "template_finish"

# File extensions treated as templates (.doc/.wps are listed for reporting
# only; python-docx can parse .docx alone).
TEMPLATE_SUFFIXES = ('.doc', '.docx', '.wps')

# Matches {{ field_code }} placeholders.
PLACEHOLDER_PATTERN = re.compile(r'\{\{([^}]+)\}\}')

# Last id handed out by generate_id(); used to keep ids strictly increasing.
_last_generated_id = 0


def print_section(title):
    """Print a section heading framed by separator lines."""
    print("\n" + "=" * 70)
    print(f" {title}")
    print("=" * 70)


def print_result(success, message):
    """Print *message* prefixed with [OK] or [FAIL] depending on *success*."""
    status = "[OK]" if success else "[FAIL]"
    print(f"{status} {message}")


def generate_id():
    """Return a time-based integer id (microseconds since the epoch).

    The value is forced to be strictly greater than the previous one returned
    by this process, so two calls that land on the same clock tick cannot
    yield the same id.
    """
    global _last_generated_id
    candidate = int(time.time() * 1000000)
    if candidate <= _last_generated_id:
        candidate = _last_generated_id + 1
    _last_generated_id = candidate
    return candidate


def scan_local_templates(base_dir: Path) -> Dict[str, Path]:
    """Collect the template files found below *base_dir*.

    Returns:
        Mapping of the file path relative to PROJECT_ROOT (forward slashes)
        to the path of the file. Empty when *base_dir* does not exist.
    """
    templates: Dict[str, Path] = {}
    if not base_dir.exists():
        return templates

    for file_path in base_dir.rglob('*'):
        if not file_path.is_file():
            continue
        if file_path.suffix.lower() not in TEMPLATE_SUFFIXES:
            continue
        # Keys are relative to the project root, not to base_dir, and always
        # use forward slashes so they compare equal across platforms.
        relative_path = file_path.relative_to(PROJECT_ROOT)
        templates[relative_path.as_posix()] = file_path

    return templates


def get_actual_tenant_id(conn) -> int:
    """Return one tenant_id present in f_polic_file_config, or 1 if none."""
    with conn.cursor(pymysql.cursors.DictCursor) as cursor:
        cursor.execute("SELECT DISTINCT tenant_id FROM f_polic_file_config LIMIT 1")
        row = cursor.fetchone()
    if row:
        return row['tenant_id']
    return 1  # default when the table has no rows


def get_db_templates(conn, tenant_id: int) -> Dict[str, Dict]:
    """Load the template records of *tenant_id*, keyed by file_path.

    Records whose file_path is empty or NULL are left out.
    """
    sql = """
        SELECT id, name, file_path, state, parent_id
        FROM f_polic_file_config
        WHERE tenant_id = %s
    """
    with conn.cursor(pymysql.cursors.DictCursor) as cursor:
        cursor.execute(sql, (tenant_id,))
        rows = cursor.fetchall()

    return {
        row['file_path']: {
            'id': row['id'],
            'name': row['name'],
            'file_path': row['file_path'],
            'state': row['state'],
            'parent_id': row['parent_id'],
        }
        for row in rows
        if row['file_path']
    }


def _collect_placeholders(paragraphs, found: Set[str]) -> None:
    """Add every non-empty placeholder name in *paragraphs* to *found*."""
    for paragraph in paragraphs:
        for match in PLACEHOLDER_PATTERN.findall(paragraph.text):
            field_code = match.strip()
            if field_code:
                found.add(field_code)


def extract_placeholders_from_docx(file_path: Path) -> Tuple[Set[str], bool]:
    """Extract all placeholders from a docx file.

    Body paragraphs and the paragraphs of every table cell are searched.

    Returns:
        (set of placeholder names, whether the file could be read). On a read
        failure the set holds whatever was collected before the error.
    """
    placeholders: Set[str] = set()
    try:
        doc = Document(file_path)
        _collect_placeholders(doc.paragraphs, placeholders)

        for table in doc.tables:
            try:
                for row in table.rows:
                    for cell in row.cells:
                        _collect_placeholders(cell.paragraphs, placeholders)
            except Exception:
                # Deliberate best effort: some table structures make
                # python-docx raise; skip that table and keep going.
                continue
    except Exception:
        # The file could not be read (for example a .doc/.wps file, which
        # python-docx does not parse).
        return placeholders, False

    return placeholders, True


def scan_all_templates_placeholders(
    local_templates: Dict[str, Path],
) -> Dict[str, Tuple[Set[str], bool, str]]:
    """Extract the placeholders of every template.

    Returns:
        Mapping of relative path to
        (placeholder set, whether the file was read, lower-cased extension).
    """
    results = {}
    for rel_path, file_path in local_templates.items():
        placeholders, success = extract_placeholders_from_docx(file_path)
        results[rel_path] = (placeholders, success, file_path.suffix.lower())
    return results


def update_or_create_template(conn, tenant_id: int, rel_path: str,
                              file_path: Path, db_templates: Dict[str, Dict]):
    """Make sure a template record exists for *rel_path*.

    Returns:
        (template id, 'exists') when *rel_path* is already in *db_templates*,
        otherwise (new template id, 'created') after inserting and committing.

    Raises:
        Exception: whatever the insert raised, after rolling back.
    """
    existing = db_templates.get(rel_path)
    if existing is not None:
        # Already present; nothing is updated for existing records.
        return existing['id'], 'exists'

    template_id = generate_id()
    file_name = file_path.stem  # file name without the extension
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                """
                INSERT INTO f_polic_file_config
                (id, tenant_id, parent_id, name, input_data, file_path,
                 created_time, created_by, updated_time, updated_by, state)
                VALUES (%s, %s, %s, %s, %s, %s, NOW(), %s, NOW(), %s, 1)
                """,
                (
                    template_id,
                    tenant_id,
                    None,  # parent_id
                    file_name,
                    '{}',  # input_data
                    rel_path,
                    CREATED_BY,
                    UPDATED_BY,
                ),
            )
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    return template_id, 'created'


def _find_format_changes(db_templates: Dict[str, Dict],
                         local_templates: Dict[str, Path]) -> List[Tuple[str, str, Dict]]:
    """List (old .doc path, new .docx path, record) for converted templates.

    A record qualifies when its path ends in '.doc', the same path with a
    '.docx' suffix exists locally, and no record uses that '.docx' path yet.
    """
    changes = []
    for db_path, db_info in db_templates.items():
        if not db_path.endswith('.doc'):
            continue
        # Swap only the trailing suffix; str.replace would also rewrite any
        # '.doc' that appears earlier in the path.
        docx_path = db_path[:-len('.doc')] + '.docx'
        if docx_path in local_templates and docx_path not in db_templates:
            changes.append((db_path, docx_path, db_info))
    return changes


def _print_scan_statistics(template_placeholders) -> Set[str]:
    """Print the scan statistics and return the union of all placeholders."""
    all_placeholders: Set[str] = set()
    templates_with_placeholders = 0
    templates_without_placeholders = 0
    templates_read_success = 0
    templates_read_failed = 0
    doc_files: List[str] = []
    docx_files: List[str] = []

    for rel_path, (placeholders, success, file_ext) in template_placeholders.items():
        all_placeholders.update(placeholders)
        if not success:
            templates_read_failed += 1
        else:
            templates_read_success += 1
            if placeholders:
                templates_with_placeholders += 1
            else:
                templates_without_placeholders += 1

        # Classify each file exactly once, whether or not it could be read.
        if file_ext == '.docx':
            docx_files.append(rel_path)
        elif file_ext == '.doc':
            doc_files.append(rel_path)

    print("\n扫描结果统计:")
    print(f" - 成功读取: {templates_read_success} 个")
    print(f" - 读取失败: {templates_read_failed} 个")
    print(f" - 有占位符: {templates_with_placeholders} 个")
    print(f" - 无占位符: {templates_without_placeholders} 个")
    print(f" - 发现的占位符总数: {len(all_placeholders)} 个不同的占位符")

    if doc_files:
        print(f"\n [注意] 发现 {len(doc_files)} 个.doc文件(可能无法读取):")
        for doc_file in doc_files[:5]:
            print(f" - {doc_file}")
        if len(doc_files) > 5:
            print(f" ... 还有 {len(doc_files) - 5} 个")

    print(f"\n .docx文件: {len(docx_files)} 个")
    return all_placeholders


def _create_missing_templates(conn, tenant_id, local_templates, db_templates,
                              skip_paths) -> None:
    """Insert a record for every local template that has none."""
    missing_templates = [
        rel_path for rel_path in local_templates
        if rel_path not in db_templates and rel_path not in skip_paths
    ]
    if not missing_templates:
        print_result(True, "所有本地模板都已存在于数据库中")
        return

    print(f" 发现 {len(missing_templates)} 个缺失的模板记录")
    created_count = 0
    for rel_path in missing_templates:
        try:
            template_id, status = update_or_create_template(
                conn, tenant_id, rel_path, local_templates[rel_path], db_templates
            )
        except Exception as e:
            print(f" [错误] 创建失败: {rel_path}, 错误: {str(e)}")
            continue
        if status == 'created':
            print(f" [创建] ID={template_id}, 路径={rel_path}")
            created_count += 1

    if created_count > 0:
        print_result(True, f"成功创建 {created_count} 条模板记录")


def _apply_format_changes(conn, format_changes) -> None:
    """Point existing .doc records at their converted .docx files."""
    if not format_changes:
        print_result(True, "未发现文件格式变化")
        return

    print(f" 发现 {len(format_changes)} 个文件格式变化(.doc -> .docx)")
    updated_count = 0
    for old_path, new_path, db_info in format_changes:
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    """
                    UPDATE f_polic_file_config
                    SET file_path = %s
                    WHERE id = %s
                    """,
                    (new_path, db_info['id']),
                )
            conn.commit()
        except Exception as e:
            # Undo the failed statement so later updates start clean.
            conn.rollback()
            print(f" [错误] 更新失败: {str(e)}")
            continue
        print(f" [更新] ID={db_info['id']}, 名称={db_info['name']}")
        print(f" 旧路径: {old_path}")
        print(f" 新路径: {new_path}")
        updated_count += 1

    if updated_count > 0:
        print_result(True, f"成功更新 {updated_count} 条路径记录")


def _print_detailed_report(template_placeholders) -> None:
    """Print up to five readable .docx templates that contain placeholders."""
    samples = [
        (rel_path, placeholders)
        for rel_path, (placeholders, success, file_ext) in template_placeholders.items()
        if success and placeholders and file_ext == '.docx'
    ]
    if not samples:
        return

    print("\n 有占位符的模板示例(前5个):")
    for i, (rel_path, placeholders) in enumerate(samples[:5], 1):
        print(f"\n {i}. {Path(rel_path).name}")
        print(f" 路径: {rel_path}")
        print(f" 占位符数量: {len(placeholders)}")
        print(f" 占位符: {sorted(placeholders)}")


def main():
    """Scan the local templates, report placeholders and sync the database."""
    print_section("重新扫描模板占位符并更新数据库")

    # 1. Scan the local templates.
    print_section("1. 扫描本地模板文件")
    local_templates = scan_local_templates(TEMPLATES_DIR)
    print_result(True, f"找到 {len(local_templates)} 个本地模板文件")

    # Count the files per extension.
    file_types: Dict[str, int] = {}
    for file_path in local_templates.values():
        ext = file_path.suffix.lower()
        file_types[ext] = file_types.get(ext, 0) + 1

    print("\n文件类型统计:")
    for ext, count in sorted(file_types.items()):
        print(f" {ext}: {count} 个")

    if not local_templates:
        print_result(False, "未找到本地模板文件")
        return

    # 2. Connect to the database.
    print_section("2. 连接数据库")
    try:
        conn = pymysql.connect(**DB_CONFIG)
        print_result(True, "数据库连接成功")
    except Exception as e:
        print_result(False, f"数据库连接失败: {str(e)}")
        return

    try:
        # 3. Resolve the tenant_id actually used in the table.
        print_section("3. 获取实际的tenant_id")
        tenant_id = get_actual_tenant_id(conn)
        print_result(True, f"实际tenant_id: {tenant_id}")

        # 4. Load the template records from the database.
        print_section("4. 获取数据库中的模板配置")
        db_templates = get_db_templates(conn, tenant_id)
        print_result(True, f"找到 {len(db_templates)} 条数据库模板记录(有file_path的)")

        # 5. Extract the placeholders of every template.
        print_section("5. 扫描所有模板的占位符")
        print(" 正在扫描,请稍候...")
        template_placeholders = scan_all_templates_placeholders(local_templates)
        all_placeholders = _print_scan_statistics(template_placeholders)

        # 6. List every placeholder.
        print_section("6. 所有占位符列表")
        if all_placeholders:
            for placeholder in sorted(all_placeholders):
                print(f" - {placeholder}")
        else:
            print(" 未发现占位符")

        # Detect .doc -> .docx conversions before creating anything: a
        # converted file must reuse its existing record (step 8) instead of
        # getting a second record with the same file_path (step 7).
        format_changes = _find_format_changes(db_templates, local_templates)
        converted_paths = {new_path for _, new_path, _ in format_changes}

        # 7. Create the records that are missing.
        print_section("7. 检查并更新数据库")
        _create_missing_templates(
            conn, tenant_id, local_templates, db_templates, converted_paths
        )

        # 8. Apply file format changes (.doc -> .docx).
        print_section("8. 检查文件格式变化")
        _apply_format_changes(conn, format_changes)

        # 9. Detailed report.
        print_section("9. 详细报告")
        _print_detailed_report(template_placeholders)
    finally:
        conn.close()
        print_result(True, "数据库连接已关闭")

    print_section("完成")


if __name__ == "__main__":
    main()