"""Register every template under ``template_finish`` in the database and upload it to MinIO.

For each ``*.docx`` file found below the template directory the script:

1. extracts the ``{{placeholder}}`` names used in the document,
2. derives a template code from the file name,
3. uploads the file to the MinIO bucket,
4. inserts or updates the matching row in ``f_polic_file_config``.
"""
import json
import os
import random
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import pymysql
from docx import Document
from minio import Minio
from minio.error import S3Error

# NOTE(review): the MinIO and database credentials below are committed in
# plain text. They are kept unchanged so the script keeps working, but they
# should be rotated and loaded from the environment or a secret store.

# MinIO connection settings
MINIO_CONFIG = {
    'endpoint': 'minio.datacubeworld.com:9000',
    'access_key': 'JOLXFXny3avFSzB0uRA5',
    'secret_key': 'G1BR8jStNfovkfH5ou39EmPl34E4l7dGrnd3Cz0I',
    'secure': True
}

# Database connection settings
DB_CONFIG = {
    'host': '152.136.177.240',
    'port': 5012,
    'user': 'finyx',
    'password': '6QsGK6MpePZDE57Z',
    'database': 'finyx',
    'charset': 'utf8mb4'
}

# Fixed values written into every row
TENANT_ID = 615873064429507639
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152
# Evaluated once at import time, so every row touched by one run shares it.
CURRENT_TIME = datetime.now()
BUCKET_NAME = 'finyx'

# Matches ``{{ name }}`` and captures the text between the braces.
_PLACEHOLDER_PATTERN = re.compile(r'\{\{([^}]+)\}\}')
# Matches one ASCII-bracketed group, e.g. ``(XXX)``.
_BRACKETED_PATTERN = re.compile(r'\(.*?\)')
# Matches a leading ordinal such as ``2.`` or ``12-``.
_NUMBER_PREFIX_PATTERN = re.compile(r'^\d+[\.\-]?')

_DOCX_CONTENT_TYPE = (
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
)

# Known template names -> template codes.
_CODE_MAPPING = {
    '请示报告卡': 'REQUEST_REPORT_CARD',
    '初步核实审批表': 'PRELIMINARY_VERIFICATION_APPROVAL',
    '附件初核方案': 'PRELIMINARY_VERIFICATION_PLAN',
    '谈话通知书': 'INTERVIEW_NOTICE',
    '谈话审批表': 'INTERVIEW_APPROVAL',
    '谈话前安全风险评估表': 'PRE_INTERVIEW_RISK_ASSESSMENT',
    '谈话方案': 'INTERVIEW_PLAN',
    '谈话后安全风险评估表': 'POST_INTERVIEW_RISK_ASSESSMENT',
    '谈话笔录': 'INTERVIEW_RECORD',
    '谈话询问对象情况摸底调查30问': 'INTERVIEW_OBJECT_INVESTIGATION',
    '被谈话人权利义务告知书': 'INTERVIEWEE_RIGHTS_OBLIGATIONS_NOTICE',
    '点对点交接单': 'POINT_TO_POINT_HANDOVER',
    '陪送交接单': 'ESCORT_HANDOVER',
    '保密承诺书': 'CONFIDENTIALITY_COMMITMENT',
    '办案人员-办案安全保密承诺书': 'CASE_OFFICER_SECURITY_COMMITMENT',
    '请示报告卡(初核谈话)': 'REQUEST_REPORT_CARD_INTERVIEW',
    '请示报告卡(初核报告结论)': 'REQUEST_REPORT_CARD_CONCLUSION',
    'XXX初核情况报告': 'PRELIMINARY_VERIFICATION_REPORT'
}

# Longest key first, so a specific name wins over a shorter name it contains
# (e.g. '办案人员-办案安全保密承诺书' before '保密承诺书'). ``sorted`` is stable,
# so keys of equal length keep their declaration order.
_CODE_MAPPING_KEYS = tuple(sorted(_CODE_MAPPING, key=len, reverse=True))

# Copy markers that refine the generic interview-notice code.
_INTERVIEW_NOTICE_COPIES = (
    ('第一联', 'INTERVIEW_NOTICE_FIRST'),
    ('第二联', 'INTERVIEW_NOTICE_SECOND'),
    ('第三联', 'INTERVIEW_NOTICE_THIRD'),
)


def generate_id():
    """Return a numeric ID built from the current time and a random part.

    The value is ``milliseconds_since_epoch * 1000`` plus a random integer
    in ``[100000, 999999]``.

    NOTE(review): the random part has six digits while the timestamp is only
    scaled by 1000, so the two overlap and IDs are neither strictly
    time-ordered nor guaranteed unique. The formula is left unchanged so new
    IDs stay in the same range as the ones generated so far.
    """
    timestamp = int(time.time() * 1000)
    random_part = random.randint(100000, 999999)
    return timestamp * 1000 + random_part


def extract_placeholders_from_docx(file_path):
    """Collect the ``{{placeholder}}`` names used in a docx file.

    Only the paragraphs reachable through ``doc.paragraphs`` and through the
    cells of ``doc.tables`` are scanned.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        A sorted list of distinct placeholder names with surrounding
        whitespace removed. If the document cannot be read, an error line is
        printed and an empty list is returned.
    """
    placeholders = set()

    try:
        doc = Document(file_path)

        # Body paragraphs first, then every paragraph inside table cells.
        paragraphs = list(doc.paragraphs)
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    paragraphs.extend(cell.paragraphs)

        for paragraph in paragraphs:
            placeholders.update(
                match.strip()
                for match in _PLACEHOLDER_PATTERN.findall(paragraph.text)
            )
    except Exception as e:
        # Deliberately best-effort: an unreadable file yields no placeholders.
        print(f" 错误: 读取文件失败 - {str(e)}")
        return []

    return sorted(placeholders)


def generate_template_code(file_name: str, relative_path: str) -> str:
    """Derive a template code from a template file name.

    Examples:
        - "2.初步核实审批表(XXX).docx" -> "PRELIMINARY_VERIFICATION_APPROVAL"
        - "1.请示报告卡(XXX).docx" -> "REQUEST_REPORT_CARD"
        - "1.请示报告卡(初核谈话).docx" -> "REQUEST_REPORT_CARD_INTERVIEW"

    Args:
        file_name: File name of the template, including its extension.
        relative_path: Path of the file relative to the template directory.
            Not used by the current rules; kept for interface compatibility.

    Returns:
        The mapped template code when the name contains a known template
        name. Otherwise the cleaned name is upper-cased and every run of
        non-word characters becomes ``_``. If that leaves nothing, a
        ``TEMPLATE_<number>`` code is returned.
    """
    # Full-width brackets are treated exactly like ASCII ones.
    normalized = Path(file_name).stem.replace('（', '(').replace('）', ')')

    # Name with the ordinal prefix removed but bracket content kept; used for
    # mapping keys that contain brackets and for qualifier detection.
    full_name = _NUMBER_PREFIX_PATTERN.sub('', normalized.strip()).strip()

    # Name with bracket content and ordinal prefix removed.
    base_name = _BRACKETED_PATTERN.sub('', normalized).strip().rstrip('(')
    base_name = _NUMBER_PREFIX_PATTERN.sub('', base_name).strip()

    for key in _CODE_MAPPING_KEYS:
        # Keys with brackets can only match a name that still has them.
        haystack = full_name if '(' in key else base_name
        if key not in haystack:
            continue

        if key == '谈话通知书':
            # Interview notices come in numbered copies.
            for marker, copy_code in _INTERVIEW_NOTICE_COPIES:
                if marker in full_name:
                    return copy_code
        elif key == '保密承诺书':
            # Check the negated form first: '非中共党员' contains '党员'.
            if '非中共党员' in full_name or '非党员' in full_name:
                return 'CONFIDENTIALITY_COMMITMENT_NON_PARTY'
            if '党员' in full_name:
                return 'CONFIDENTIALITY_COMMITMENT_PARTY'

        return _CODE_MAPPING[key]

    # No known name matched: fall back to a generic rule. ``\w`` also matches
    # CJK characters, so Chinese text is kept as-is (no pinyin conversion).
    code = base_name.upper()
    code = re.sub(r'[^\w]', '_', code)
    code = re.sub(r'_+', '_', code).strip('_')
    return code if code else f'TEMPLATE_{generate_id() % 1000000}'


def upload_to_minio(client: Minio, file_path: str, template_name: str) -> str:
    """Upload a template file to the MinIO bucket.

    The object is stored as
    ``<TENANT_ID>/TEMPLATE/<year>/<month>/<template_name>``.

    Args:
        client: MinIO client used for the upload.
        file_path: Local path of the file to upload.
        template_name: Object file name inside the month folder.

    Returns:
        The object name prefixed with ``/``.

    Raises:
        Exception: If the upload fails; the original error is chained.
    """
    try:
        now = datetime.now()
        object_name = (
            f'{TENANT_ID}/TEMPLATE/{now.year}/{now.month:02d}/{template_name}'
        )
        client.fput_object(
            BUCKET_NAME,
            object_name,
            file_path,
            content_type=_DOCX_CONTENT_TYPE
        )
        return f"/{object_name}"
    except Exception as e:
        raise Exception(f"上传到MinIO失败: {str(e)}") from e


def register_template_to_db(conn, template_info: Dict) -> int:
    """Insert or update the ``f_polic_file_config`` row of a template.

    A row is looked up by tenant and name. An existing row gets its file
    path and input data refreshed and its state set to 1; otherwise a new
    row with a generated ID is inserted.

    Args:
        conn: Open database connection; committed on success and rolled
            back on failure.
        template_info: Must provide ``name``, ``file_path`` and
            ``input_data``; ``parent_id`` is optional.

    Returns:
        The ID of the updated or newly created row.

    Raises:
        Exception: If any statement fails; the original error is chained.
    """
    try:
        # The context manager closes the cursor even when a statement fails.
        with conn.cursor() as cursor:
            # Check whether the template is already registered.
            check_sql = """
                SELECT id FROM f_polic_file_config
                WHERE tenant_id = %s AND name = %s
            """
            cursor.execute(check_sql, (TENANT_ID, template_info['name']))
            existing = cursor.fetchone()

            if existing:
                file_config_id = existing[0]
                # Refresh the existing record.
                update_sql = """
                    UPDATE f_polic_file_config
                    SET file_path = %s, input_data = %s,
                        updated_time = %s, updated_by = %s, state = 1
                    WHERE id = %s AND tenant_id = %s
                """
                cursor.execute(update_sql, (
                    template_info['file_path'],
                    template_info['input_data'],
                    CURRENT_TIME,
                    UPDATED_BY,
                    file_config_id,
                    TENANT_ID
                ))
                print(f" ✓ 更新文件配置: {template_info['name']}, ID: {file_config_id}")
            else:
                file_config_id = generate_id()
                insert_sql = """
                    INSERT INTO f_polic_file_config
                    (id, tenant_id, parent_id, name, input_data, file_path,
                     created_time, created_by, updated_time, updated_by, state)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(insert_sql, (
                    file_config_id,
                    TENANT_ID,
                    template_info.get('parent_id'),
                    template_info['name'],
                    template_info['input_data'],
                    template_info['file_path'],
                    CURRENT_TIME,
                    CREATED_BY,
                    CURRENT_TIME,
                    CREATED_BY,
                    1  # state: 1 means enabled
                ))
                print(f" ✓ 创建文件配置: {template_info['name']}, ID: {file_config_id}")

        conn.commit()
        return file_config_id
    except Exception as e:
        conn.rollback()
        raise Exception(f"注册到数据库失败: {str(e)}") from e


def _register_single_template(conn, minio_client: Minio, docx_file: Path,
                              relative_path: Path) -> int:
    """Extract, upload and register one template file; return its config ID."""
    # Extract placeholders
    placeholders = extract_placeholders_from_docx(str(docx_file))
    print(f" 占位符数量: {len(placeholders)}")

    # Template code and object name
    template_code = generate_template_code(docx_file.name, str(relative_path))
    template_name = docx_file.name

    # Upload to MinIO
    print(" 正在上传到MinIO...")
    file_path = upload_to_minio(minio_client, str(docx_file), template_name)
    print(f" ✓ 上传成功: {file_path}")

    # Build the database record
    input_data = json.dumps({
        'template_code': template_code,
        'business_type': 'INVESTIGATION',  # default business type
        'placeholders': placeholders  # stored for reference
    }, ensure_ascii=False)

    template_info = {
        # File name without its extension; ``stem`` only drops the suffix,
        # unlike ``str.replace`` which would hit every '.docx' occurrence.
        'name': docx_file.stem,
        'template_code': template_code,
        'file_path': file_path,
        'input_data': input_data,
        'parent_id': None
    }

    # Register in the database
    print(" 正在注册到数据库...")
    file_config_id = register_template_to_db(conn, template_info)
    print(f" ✓ 注册成功,配置ID: {file_config_id}")
    return file_config_id


def process_templates_in_directory(base_dir: str):
    """Upload and register every ``*.docx`` template below ``base_dir``.

    Files are visited recursively in sorted order; names starting with
    ``~$`` are skipped. A failure on one file is recorded and processing
    continues with the next one. A summary is printed at the end.

    The function returns early, after printing an error, when ``base_dir``
    does not exist or the MinIO bucket is missing. The database connection
    is closed on every exit path.

    Args:
        base_dir: Directory containing the template files.
    """
    base_path = Path(base_dir)

    if not base_path.exists():
        print(f"错误: 目录不存在 - {base_dir}")
        return

    # Connect to the database and MinIO
    print("=" * 80)
    print("连接数据库和MinIO...")
    print("=" * 80)

    processed_count = 0
    success_count = 0
    failed_count = 0
    failed_files = []

    conn = pymysql.connect(**DB_CONFIG)
    try:
        minio_client = Minio(
            MINIO_CONFIG['endpoint'],
            access_key=MINIO_CONFIG['access_key'],
            secret_key=MINIO_CONFIG['secret_key'],
            secure=MINIO_CONFIG['secure']
        )

        # Check the bucket
        if not minio_client.bucket_exists(BUCKET_NAME):
            print(f"错误: 存储桶 '{BUCKET_NAME}' 不存在")
            return

        print(f"✓ 存储桶 '{BUCKET_NAME}' 已存在\n")

        # Walk all docx files
        print("=" * 80)
        print("开始处理模板文件...")
        print("=" * 80)
        print()

        for docx_file in sorted(base_path.rglob("*.docx")):
            # Skip temporary files
            if docx_file.name.startswith("~$"):
                continue

            processed_count += 1
            relative_path = docx_file.relative_to(base_path)
            print(f"[{processed_count}] 处理: {relative_path}")

            try:
                _register_single_template(
                    conn, minio_client, docx_file, relative_path
                )
                success_count += 1
                print()
            except Exception as e:
                # One bad file must not abort the whole batch.
                failed_count += 1
                failed_files.append((str(relative_path), str(e)))
                print(f" ✗ 处理失败: {str(e)}\n")
    finally:
        # Close the connection on early return and on unexpected errors too.
        conn.close()

    # Print the summary
    print("=" * 80)
    print("处理汇总")
    print("=" * 80)
    print(f"总文件数: {processed_count}")
    print(f"成功: {success_count}")
    print(f"失败: {failed_count}")

    if failed_files:
        print("\n失败的文件:")
        for file_path, error in failed_files:
            print(f" - {file_path}: {error}")

    print("\n处理完成!")


def main():
    """Process the ``template_finish`` directory located next to this script."""
    template_dir = os.path.join(os.path.dirname(__file__), 'template_finish')

    print(f"模板目录: {template_dir}")
    print()

    process_templates_in_directory(template_dir)


if __name__ == '__main__':
    main()