ai-business-write/register_templates_to_db.py

351 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
批量将template_finish文件夹下的模板文件注册到数据库并上传到MinIO
"""
import os
import re
import json
import pymysql
from minio import Minio
from minio.error import S3Error
from datetime import datetime
from pathlib import Path
from docx import Document
from typing import Dict, List, Optional
# ---------------------------------------------------------------------------
# Connection settings and fixed values.
#
# SECURITY NOTE(review): the literal credentials below are committed to source
# control. They are kept only as fallbacks so the script keeps working exactly
# as before when no environment variables are set. Prefer supplying the
# FINYX_* environment variables, then rotate and remove the literals.
# ---------------------------------------------------------------------------

# MinIO connection settings (each value can be overridden via environment).
MINIO_CONFIG = {
    'endpoint': os.environ.get('FINYX_MINIO_ENDPOINT', 'minio.datacubeworld.com:9000'),
    'access_key': os.environ.get('FINYX_MINIO_ACCESS_KEY', 'JOLXFXny3avFSzB0uRA5'),
    'secret_key': os.environ.get(
        'FINYX_MINIO_SECRET_KEY', 'G1BR8jStNfovkfH5ou39EmPl34E4l7dGrnd3Cz0I'
    ),
    'secure': True,
}

# Database connection settings (each value can be overridden via environment).
DB_CONFIG = {
    'host': os.environ.get('FINYX_DB_HOST', '152.136.177.240'),
    # The port must stay an int, so the environment value is converted.
    'port': int(os.environ.get('FINYX_DB_PORT', '5012')),
    'user': os.environ.get('FINYX_DB_USER', 'finyx'),
    'password': os.environ.get('FINYX_DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.environ.get('FINYX_DB_NAME', 'finyx'),
    'charset': 'utf8mb4',
}

# Fixed values written into every registered row.
TENANT_ID = 615873064429507639
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152
# Captured once at import time, so all rows written in one run share a timestamp.
CURRENT_TIME = datetime.now()
BUCKET_NAME = 'finyx'
def generate_id():
    """Build a numeric ID from the current time plus a random component.

    The result is ``milliseconds_since_epoch * 1000 + r`` where ``r`` is a
    random integer in ``[100000, 999999]``. Because ``r`` has six digits and
    the multiplier is 1000, the random part overlaps the low digits of the
    timestamp; uniqueness is therefore probabilistic, not guaranteed.

    Returns:
        int: the generated ID.
    """
    import random
    import time

    # The clock is read before the random draw, in that order.
    epoch_millis = int(time.time() * 1000)
    return epoch_millis * 1000 + random.randint(100000, 999999)
def extract_placeholders_from_docx(file_path):
    """Collect every ``{{name}}`` placeholder found in a .docx file.

    Body paragraphs are scanned first, then every paragraph of every cell of
    every top-level table. Placeholder names are whitespace-stripped and
    de-duplicated.

    Args:
        file_path: path of the document, passed straight to ``Document``.

    Returns:
        list: sorted unique placeholder names, or ``[]`` if anything fails
        while opening or reading the document (the error is printed).
    """
    placeholder_re = re.compile(r'\{\{([^}]+)\}\}')
    found = set()

    def scan(paragraphs):
        # Record the stripped name of each placeholder in these paragraphs.
        for para in paragraphs:
            found.update(name.strip() for name in placeholder_re.findall(para.text))

    try:
        document = Document(file_path)
        scan(document.paragraphs)
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                for tbl_cell in tbl_row.cells:
                    scan(tbl_cell.paragraphs)
    except Exception as e:
        # Best effort: report the problem and treat the file as having none.
        print(f" 错误: 读取文件失败 - {str(e)}")
        return []

    return sorted(found)
def generate_template_code(file_name: str, relative_path: str) -> str:
    """Derive a template code from a template file name.

    Examples:
        "2.初步核实审批表（XXX）.docx"   -> "PRELIMINARY_VERIFICATION_APPROVAL"
        "1.请示报告卡（XXX）.docx"       -> "REQUEST_REPORT_CARD"
        "1.请示报告卡（初核谈话）.docx"  -> "REQUEST_REPORT_CARD_INTERVIEW"

    Two views of the name are used:

    * ``full_name`` - the stem with full-width brackets normalised to ASCII
      and the leading number prefix removed; bracket content is kept.
    * ``base_name`` - the same, with bracketed content removed.

    Mapping keys that themselves contain brackets are matched against
    ``full_name``; all other keys are matched against ``base_name``. Keys are
    tried longest first so that a specific key is never shadowed by a shorter
    key it contains.

    Args:
        file_name: file name of the template, extension included.
        relative_path: accepted for interface compatibility; not used.

    Returns:
        The mapped code, or an upper-cased, underscore-separated form of the
        base name when no mapping key matches, or ``TEMPLATE_<n>`` when that
        form is empty.
    """
    number_prefix = r'^\d+[\.\-]?'

    # Normalise full-width brackets so a single rule handles both styles.
    stem = Path(file_name).stem.replace('（', '(').replace('）', ')')

    full_name = re.sub(number_prefix, '', stem.strip()).strip()

    # Drop bracketed content, then any dangling unmatched opening bracket.
    base_name = re.sub(r'\(.*?\)', '', stem).strip().rstrip('(')
    base_name = re.sub(number_prefix, '', base_name).strip()

    code_mapping = {
        '请示报告卡': 'REQUEST_REPORT_CARD',
        '初步核实审批表': 'PRELIMINARY_VERIFICATION_APPROVAL',
        '附件初核方案': 'PRELIMINARY_VERIFICATION_PLAN',
        '谈话通知书': 'INTERVIEW_NOTICE',
        '谈话审批表': 'INTERVIEW_APPROVAL',
        '谈话前安全风险评估表': 'PRE_INTERVIEW_RISK_ASSESSMENT',
        '谈话方案': 'INTERVIEW_PLAN',
        '谈话后安全风险评估表': 'POST_INTERVIEW_RISK_ASSESSMENT',
        '谈话笔录': 'INTERVIEW_RECORD',
        '谈话询问对象情况摸底调查30问': 'INTERVIEW_OBJECT_INVESTIGATION',
        '被谈话人权利义务告知书': 'INTERVIEWEE_RIGHTS_OBLIGATIONS_NOTICE',
        '点对点交接单': 'POINT_TO_POINT_HANDOVER',
        '陪送交接单': 'ESCORT_HANDOVER',
        '保密承诺书': 'CONFIDENTIALITY_COMMITMENT',
        '办案人员-办案安全保密承诺书': 'CASE_OFFICER_SECURITY_COMMITMENT',
        '请示报告卡(初核谈话)': 'REQUEST_REPORT_CARD_INTERVIEW',
        '请示报告卡(初核报告结论)': 'REQUEST_REPORT_CARD_CONCLUSION',
        'XXX初核情况报告': 'PRELIMINARY_VERIFICATION_REPORT',
    }

    def mentions(token: str) -> bool:
        # Qualifiers may sit inside brackets, so look at both views.
        return token in full_name or token in base_name

    # sorted() is stable, so keys of equal length keep their mapping order.
    for key in sorted(code_mapping, key=len, reverse=True):
        haystack = full_name if '(' in key else base_name
        if key not in haystack:
            continue

        if key == '谈话通知书':
            # Interview notices are issued in numbered copies.
            if mentions('第一联'):
                return 'INTERVIEW_NOTICE_FIRST'
            if mentions('第二联'):
                return 'INTERVIEW_NOTICE_SECOND'
            if mentions('第三联'):
                return 'INTERVIEW_NOTICE_THIRD'

        if key == '保密承诺书':
            # Test the "non-member" wording first: it contains the "member"
            # wording as a substring.
            if mentions('非中共党员') or mentions('非党员'):
                return 'CONFIDENTIALITY_COMMITMENT_NON_PARTY'
            if mentions('中共党员') or mentions('党员'):
                return 'CONFIDENTIALITY_COMMITMENT_PARTY'

        return code_mapping[key]

    # No mapping matched: fall back to a generic code. Note that \w also
    # matches CJK characters, so Chinese text is kept as-is (a proper
    # transliteration would need something like pypinyin).
    code = base_name.upper()
    code = re.sub(r'[^\w]', '_', code)
    code = re.sub(r'_+', '_', code).strip('_')
    return code if code else f'TEMPLATE_{generate_id() % 1000000}'
def upload_to_minio(client: Minio, file_path: str, template_name: str) -> str:
    """Upload one template file to the MinIO bucket ``BUCKET_NAME``.

    The object key is ``<TENANT_ID>/TEMPLATE/<year>/<month:02d>/<template_name>``,
    using the year and month at the time of the call.

    Args:
        client: MinIO client used for the upload.
        file_path: local path of the file to upload.
        template_name: file name used as the last segment of the object key.

    Returns:
        The object key prefixed with ``/``.

    Raises:
        Exception: any failure is re-raised as a plain ``Exception`` whose
            message wraps the original error text.
    """
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        today = datetime.now()
        object_name = '{}/TEMPLATE/{}/{:02d}/{}'.format(
            TENANT_ID, today.year, today.month, template_name
        )
        client.fput_object(BUCKET_NAME, object_name, file_path, content_type=docx_mime)
        return '/' + object_name
    except Exception as e:
        raise Exception(f"上传到MinIO失败: {str(e)}")
def register_template_to_db(conn, template_info: Dict) -> int:
    """Insert or update one row in ``f_polic_file_config`` for a template.

    A row is looked up by ``(tenant_id, name)``. If one exists, its file path,
    input data, update stamp and state are refreshed; otherwise a new row is
    inserted with an ID from ``generate_id()``. The change is committed on
    success and rolled back on failure. The cursor is always closed.

    Args:
        conn: open DB-API connection (the caller creates it with pymysql).
        template_info: dict with keys ``name``, ``file_path``, ``input_data``
            and optionally ``parent_id``.

    Returns:
        The ID of the updated or newly created row.

    Raises:
        Exception: any failure is re-raised as a plain ``Exception`` whose
            message wraps the original error text; the original error is
            attached as the cause.
    """
    cursor = conn.cursor()
    try:
        # Look for an existing row for this tenant and template name.
        check_sql = """
            SELECT id FROM f_polic_file_config
            WHERE tenant_id = %s AND name = %s
        """
        cursor.execute(check_sql, (TENANT_ID, template_info['name']))
        existing = cursor.fetchone()

        if existing:
            file_config_id = existing[0]
            # Refresh the existing record and re-enable it (state = 1).
            update_sql = """
                UPDATE f_polic_file_config
                SET file_path = %s, input_data = %s, updated_time = %s, updated_by = %s, state = 1
                WHERE id = %s AND tenant_id = %s
            """
            cursor.execute(update_sql, (
                template_info['file_path'],
                template_info['input_data'],
                CURRENT_TIME,
                UPDATED_BY,
                file_config_id,
                TENANT_ID,
            ))
            print(f" ✓ 更新文件配置: {template_info['name']}, ID: {file_config_id}")
        else:
            file_config_id = generate_id()
            insert_sql = """
                INSERT INTO f_polic_file_config
                (id, tenant_id, parent_id, name, input_data, file_path, created_time, created_by, updated_time, updated_by, state)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            cursor.execute(insert_sql, (
                file_config_id,
                TENANT_ID,
                template_info.get('parent_id'),
                template_info['name'],
                template_info['input_data'],
                template_info['file_path'],
                CURRENT_TIME,
                CREATED_BY,
                CURRENT_TIME,
                UPDATED_BY,
                1,  # state: 1 means enabled
            ))
            print(f" ✓ 创建文件配置: {template_info['name']}, ID: {file_config_id}")

        conn.commit()
        return file_config_id
    except Exception as e:
        conn.rollback()
        raise Exception(f"注册到数据库失败: {str(e)}") from e
    finally:
        # Release the cursor on every path; it was previously never closed.
        cursor.close()
def process_templates_in_directory(base_dir: str):
    """Upload and register every .docx template found under ``base_dir``.

    For each ``*.docx`` file found recursively (names starting with ``~$`` are
    skipped) the function extracts placeholders, derives a template code,
    uploads the file to MinIO and inserts or updates its database row. A
    failure on one file is recorded and does not stop the run. A summary is
    printed at the end.

    The database connection is closed on every exit path, including the early
    return when the bucket is missing and any error raised while setting up
    the MinIO client.

    Args:
        base_dir: directory to scan for templates.
    """
    base_path = Path(base_dir)
    if not base_path.exists():
        print(f"错误: 目录不存在 - {base_dir}")
        return

    # Connect to the database and MinIO.
    print("=" * 80)
    print("连接数据库和MinIO...")
    print("=" * 80)

    processed_count = 0
    success_count = 0
    failed_count = 0
    failed_files = []

    conn = pymysql.connect(**DB_CONFIG)
    try:
        minio_client = Minio(
            MINIO_CONFIG['endpoint'],
            access_key=MINIO_CONFIG['access_key'],
            secret_key=MINIO_CONFIG['secret_key'],
            secure=MINIO_CONFIG['secure']
        )

        # The bucket must already exist; this script never creates it.
        if not minio_client.bucket_exists(BUCKET_NAME):
            print(f"错误: 存储桶 '{BUCKET_NAME}' 不存在")
            return
        print(f"✓ 存储桶 '{BUCKET_NAME}' 已存在\n")

        # Walk all .docx files.
        print("=" * 80)
        print("开始处理模板文件...")
        print("=" * 80)
        print()

        for docx_file in sorted(base_path.rglob("*.docx")):
            # Skip temporary lock files whose names start with "~$".
            if docx_file.name.startswith("~$"):
                continue

            processed_count += 1
            relative_path = docx_file.relative_to(base_path)
            print(f"[{processed_count}] 处理: {relative_path}")

            try:
                # Extract placeholders.
                placeholders = extract_placeholders_from_docx(str(docx_file))
                print(f" 占位符数量: {len(placeholders)}")

                # Derive the template code and name.
                template_code = generate_template_code(docx_file.name, str(relative_path))
                template_name = docx_file.name

                # Upload to MinIO.
                print(f" 正在上传到MinIO...")
                file_path = upload_to_minio(minio_client, str(docx_file), template_name)
                print(f" ✓ 上传成功: {file_path}")

                # Build the database record.
                input_data = json.dumps({
                    'template_code': template_code,
                    'business_type': 'INVESTIGATION',  # default business type
                    'placeholders': placeholders  # kept for reference
                }, ensure_ascii=False)
                template_info = {
                    'name': template_name.replace('.docx', ''),  # name without extension
                    'template_code': template_code,
                    'file_path': file_path,
                    'input_data': input_data,
                    'parent_id': None
                }

                # Register in the database.
                print(f" 正在注册到数据库...")
                file_config_id = register_template_to_db(conn, template_info)
                print(f" ✓ 注册成功配置ID: {file_config_id}")
                success_count += 1
                print()
            except Exception as e:
                # Record the failure and carry on with the next file.
                failed_count += 1
                failed_files.append((str(relative_path), str(e)))
                print(f" ✗ 处理失败: {str(e)}\n")
    finally:
        # Always release the connection, whatever happened above.
        conn.close()

    # Print the summary.
    print("=" * 80)
    print("处理汇总")
    print("=" * 80)
    print(f"总文件数: {processed_count}")
    print(f"成功: {success_count}")
    print(f"失败: {failed_count}")
    if failed_files:
        print("\n失败的文件:")
        for file_path, error in failed_files:
            print(f" - {file_path}: {error}")
    print("\n处理完成!")
def main():
    """Entry point: process the ``template_finish`` folder beside this script."""
    script_dir = os.path.dirname(__file__)
    template_dir = os.path.join(script_dir, 'template_finish')
    print(f"模板目录: {template_dir}")
    print()
    process_templates_in_directory(template_dir)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()