ai-business-write/rescan_and_update_templates.py
2025-12-26 09:16:31 +08:00

406 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
重新扫描模板占位符并更新数据库
1. 扫描所有本地模板文件(包括新转换的.docx文件
2. 提取所有占位符
3. 检查数据库中的模板记录
4. 更新数据库(如有变化)
"""
import os
import pymysql
from pathlib import Path
from typing import Dict, List, Set, Tuple
from dotenv import load_dotenv
import re
from docx import Document
# 加载环境变量
load_dotenv()
# 数据库配置
DB_CONFIG = {
'host': os.getenv('DB_HOST', '152.136.177.240'),
'port': int(os.getenv('DB_PORT', 5012)),
'user': os.getenv('DB_USER', 'finyx'),
'password': os.getenv('DB_PASSWORD', '6QsGK6MpePZDE57Z'),
'database': os.getenv('DB_NAME', 'finyx'),
'charset': 'utf8mb4'
}
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152
# 项目根目录
PROJECT_ROOT = Path(__file__).parent
TEMPLATES_DIR = PROJECT_ROOT / "template_finish"
def print_section(title):
"""打印章节标题"""
print("\n" + "="*70)
print(f" {title}")
print("="*70)
def print_result(success, message):
"""打印结果"""
status = "[OK]" if success else "[FAIL]"
print(f"{status} {message}")
def generate_id():
"""生成ID"""
import time
return int(time.time() * 1000000)
def scan_local_templates(base_dir: Path) -> Dict[str, Path]:
"""扫描本地模板文件"""
templates = {}
if not base_dir.exists():
return templates
for file_path in base_dir.rglob('*'):
if file_path.is_file():
# 只处理文档文件(优先处理.docx也包含.doc和.wps用于检查
if file_path.suffix.lower() in ['.doc', '.docx', '.wps']:
relative_path = file_path.relative_to(PROJECT_ROOT)
relative_path_str = str(relative_path).replace('\\', '/')
templates[relative_path_str] = file_path
return templates
def get_actual_tenant_id(conn) -> int:
"""获取数据库中的实际tenant_id"""
cursor = conn.cursor(pymysql.cursors.DictCursor)
try:
cursor.execute("SELECT DISTINCT tenant_id FROM f_polic_file_config LIMIT 1")
result = cursor.fetchone()
if result:
return result['tenant_id']
return 1 # 默认值
finally:
cursor.close()
def get_db_templates(conn, tenant_id: int) -> Dict[str, Dict]:
"""从数据库获取所有模板配置"""
cursor = conn.cursor(pymysql.cursors.DictCursor)
try:
sql = """
SELECT id, name, file_path, state, parent_id
FROM f_polic_file_config
WHERE tenant_id = %s
"""
cursor.execute(sql, (tenant_id,))
templates = cursor.fetchall()
result = {}
for template in templates:
file_path = template['file_path']
if file_path:
result[file_path] = {
'id': template['id'],
'name': template['name'],
'file_path': file_path,
'state': template['state'],
'parent_id': template['parent_id']
}
return result
finally:
cursor.close()
def extract_placeholders_from_docx(file_path: Path) -> Tuple[Set[str], bool]:
"""
从docx文件中提取所有占位符
Returns:
(占位符集合, 是否成功读取)
"""
placeholders = set()
placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}')
success = False
try:
doc = Document(file_path)
success = True
# 从段落中提取占位符
for paragraph in doc.paragraphs:
text = paragraph.text
matches = placeholder_pattern.findall(text)
for match in matches:
field_code = match.strip()
if field_code:
placeholders.add(field_code)
# 从表格中提取占位符
for table in doc.tables:
try:
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
text = paragraph.text
matches = placeholder_pattern.findall(text)
for match in matches:
field_code = match.strip()
if field_code:
placeholders.add(field_code)
except Exception as e:
# 某些表格结构可能导致错误,跳过
continue
except Exception as e:
# 文件读取失败(可能是.doc格式或其他问题
return placeholders, False
return placeholders, success
def scan_all_templates_placeholders(local_templates: Dict[str, Path]) -> Dict[str, Tuple[Set[str], bool, str]]:
"""
扫描所有模板的占位符
Returns:
字典key为相对路径value为(占位符集合, 是否成功读取, 文件扩展名)
"""
results = {}
for rel_path, file_path in local_templates.items():
file_ext = file_path.suffix.lower()
placeholders, success = extract_placeholders_from_docx(file_path)
results[rel_path] = (placeholders, success, file_ext)
return results
def update_or_create_template(conn, tenant_id: int, rel_path: str, file_path: Path, db_templates: Dict[str, Dict]):
"""更新或创建模板记录"""
cursor = conn.cursor()
try:
# 检查是否已存在
if rel_path in db_templates:
# 已存在,检查是否需要更新
template_id = db_templates[rel_path]['id']
# 这里可以添加更新逻辑,比如更新名称等
return template_id, 'exists'
else:
# 不存在,创建新记录
template_id = generate_id()
file_name = file_path.stem # 不含扩展名的文件名
cursor.execute("""
INSERT INTO f_polic_file_config
(id, tenant_id, parent_id, name, input_data, file_path, created_time, created_by, updated_time, updated_by, state)
VALUES (%s, %s, %s, %s, %s, %s, NOW(), %s, NOW(), %s, 1)
""", (
template_id,
tenant_id,
None, # parent_id
file_name,
'{}', # input_data
rel_path,
CREATED_BY,
UPDATED_BY
))
conn.commit()
return template_id, 'created'
except Exception as e:
conn.rollback()
raise e
finally:
cursor.close()
def main():
"""主函数"""
print_section("重新扫描模板占位符并更新数据库")
# 1. 扫描本地模板
print_section("1. 扫描本地模板文件")
local_templates = scan_local_templates(TEMPLATES_DIR)
print_result(True, f"找到 {len(local_templates)} 个本地模板文件")
# 统计文件类型
file_types = {}
for file_path in local_templates.values():
ext = file_path.suffix.lower()
file_types[ext] = file_types.get(ext, 0) + 1
print("\n文件类型统计:")
for ext, count in sorted(file_types.items()):
print(f" {ext}: {count}")
if not local_templates:
print_result(False, "未找到本地模板文件")
return
# 2. 连接数据库
print_section("2. 连接数据库")
try:
conn = pymysql.connect(**DB_CONFIG)
print_result(True, "数据库连接成功")
except Exception as e:
print_result(False, f"数据库连接失败: {str(e)}")
return
try:
# 3. 获取实际的tenant_id
print_section("3. 获取实际的tenant_id")
tenant_id = get_actual_tenant_id(conn)
print_result(True, f"实际tenant_id: {tenant_id}")
# 4. 获取数据库中的模板
print_section("4. 获取数据库中的模板配置")
db_templates = get_db_templates(conn, tenant_id)
print_result(True, f"找到 {len(db_templates)} 条数据库模板记录有file_path的")
# 5. 扫描所有模板的占位符
print_section("5. 扫描所有模板的占位符")
print(" 正在扫描,请稍候...")
template_placeholders = scan_all_templates_placeholders(local_templates)
# 统计结果
all_placeholders = set()
templates_with_placeholders = 0
templates_without_placeholders = 0
templates_read_success = 0
templates_read_failed = 0
doc_files = []
docx_files = []
for rel_path, (placeholders, success, file_ext) in template_placeholders.items():
all_placeholders.update(placeholders)
if success:
templates_read_success += 1
if placeholders:
templates_with_placeholders += 1
else:
templates_without_placeholders += 1
else:
templates_read_failed += 1
if file_ext == '.doc':
doc_files.append(rel_path)
if file_ext == '.docx':
docx_files.append(rel_path)
elif file_ext == '.doc':
doc_files.append(rel_path)
print(f"\n扫描结果统计:")
print(f" - 成功读取: {templates_read_success}")
print(f" - 读取失败: {templates_read_failed}")
print(f" - 有占位符: {templates_with_placeholders}")
print(f" - 无占位符: {templates_without_placeholders}")
print(f" - 发现的占位符总数: {len(all_placeholders)} 个不同的占位符")
if doc_files:
print(f"\n [注意] 发现 {len(doc_files)} 个.doc文件可能无法读取:")
for doc_file in doc_files[:5]:
print(f" - {doc_file}")
if len(doc_files) > 5:
print(f" ... 还有 {len(doc_files) - 5}")
print(f"\n .docx文件: {len(docx_files)}")
# 6. 显示所有占位符
print_section("6. 所有占位符列表")
if all_placeholders:
for placeholder in sorted(all_placeholders):
print(f" - {placeholder}")
else:
print(" 未发现占位符")
# 7. 检查并更新数据库
print_section("7. 检查并更新数据库")
missing_templates = []
for rel_path in local_templates.keys():
if rel_path not in db_templates:
missing_templates.append(rel_path)
if missing_templates:
print(f" 发现 {len(missing_templates)} 个缺失的模板记录")
created_count = 0
for rel_path in missing_templates:
file_path = local_templates[rel_path]
try:
template_id, status = update_or_create_template(conn, tenant_id, rel_path, file_path, db_templates)
if status == 'created':
print(f" [创建] ID={template_id}, 路径={rel_path}")
created_count += 1
except Exception as e:
print(f" [错误] 创建失败: {rel_path}, 错误: {str(e)}")
if created_count > 0:
print_result(True, f"成功创建 {created_count} 条模板记录")
else:
print_result(True, "所有本地模板都已存在于数据库中")
# 8. 检查文件格式变化(.doc -> .docx
print_section("8. 检查文件格式变化")
# 检查数据库中是否有.doc路径但本地已经是.docx
format_changes = []
for db_path, db_info in db_templates.items():
if db_path.endswith('.doc'):
# 检查是否有对应的.docx文件
docx_path = db_path.replace('.doc', '.docx')
if docx_path in local_templates:
format_changes.append((db_path, docx_path, db_info))
if format_changes:
print(f" 发现 {len(format_changes)} 个文件格式变化(.doc -> .docx")
updated_count = 0
for old_path, new_path, db_info in format_changes:
try:
cursor = conn.cursor()
cursor.execute("""
UPDATE f_polic_file_config
SET file_path = %s
WHERE id = %s
""", (new_path, db_info['id']))
conn.commit()
cursor.close()
print(f" [更新] ID={db_info['id']}, 名称={db_info['name']}")
print(f" 旧路径: {old_path}")
print(f" 新路径: {new_path}")
updated_count += 1
except Exception as e:
print(f" [错误] 更新失败: {str(e)}")
if updated_count > 0:
print_result(True, f"成功更新 {updated_count} 条路径记录")
else:
print_result(True, "未发现文件格式变化")
# 9. 生成详细报告
print_section("9. 详细报告")
# 找出有占位符的模板示例
templates_with_placeholders_list = []
for rel_path, (placeholders, success, file_ext) in template_placeholders.items():
if success and placeholders and file_ext == '.docx':
templates_with_placeholders_list.append((rel_path, placeholders))
if templates_with_placeholders_list:
print(f"\n 有占位符的模板示例前5个:")
for i, (rel_path, placeholders) in enumerate(templates_with_placeholders_list[:5], 1):
print(f"\n {i}. {Path(rel_path).name}")
print(f" 路径: {rel_path}")
print(f" 占位符数量: {len(placeholders)}")
print(f" 占位符: {sorted(placeholders)}")
finally:
conn.close()
print_result(True, "数据库连接已关闭")
print_section("完成")
if __name__ == "__main__":
main()