"""
Scan the template documents under template_finish/, analyze their placeholders,
and update the template-to-field relations in the database.
"""
|
||
import os
|
||
import re
|
||
import sys
|
||
import pymysql
|
||
from pathlib import Path
|
||
from docx import Document
|
||
from datetime import datetime
|
||
from typing import Dict, List, Set, Optional
|
||
from dotenv import load_dotenv
|
||
|
||
# Force UTF-8 console output on Windows so the Chinese log messages print
# correctly. reconfigure() keeps the existing stream (and its buffering
# settings); wrapping sys.stdout.buffer is only a fallback for streams that
# do not offer reconfigure().
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    else:
        import io
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Load DB_* settings from a local .env file, if one exists.
load_dotenv()

# Database connection settings; every value except the charset can be
# overridden through environment variables.
# NOTE(review): the fallback host and password below are real-looking
# credentials committed to source. They are kept only for backward
# compatibility -- move them to the environment/.env and rotate the password.
DB_CONFIG = {
    'host': os.getenv('DB_HOST', '152.136.177.240'),
    'port': int(os.getenv('DB_PORT', 5012)),
    'user': os.getenv('DB_USER', 'finyx'),
    'password': os.getenv('DB_PASSWORD', '6QsGK6MpePZDE57Z'),
    'database': os.getenv('DB_NAME', 'finyx'),
    'charset': 'utf8mb4'
}

# Tenant whose fields/templates are processed, and the user ids written into
# the created_by / updated_by audit columns.
TENANT_ID = 615873064429507639
CREATED_BY = 655162080928945152
UPDATED_BY = 655162080928945152
# Captured once at import time, so every row written during one run carries
# the same timestamp.
CURRENT_TIME = datetime.now()

# Directory (relative to the working directory) that holds the .docx templates
TEMPLATE_DIR = 'template_finish'
|
||
|
||
|
||
def _collect_placeholders(text: str, found: Set[str]) -> None:
    """Add every well-formed ``{{field_code}}`` name found in *text* to *found*."""
    for raw in re.findall(r'\{\{([^}]+)\}\}', text):
        code = raw.strip()
        # The pattern can never capture '}', so a stray '{' (e.g. "{{{name}}")
        # is the only sign of a malformed placeholder left to filter out.
        if code and '{' not in code:
            found.add(code)


def extract_placeholders_from_docx(file_path: str) -> List[str]:
    """
    Extract all ``{{field_code}}`` placeholders from a docx file.

    Both body paragraphs and top-level table cells are scanned. The text of a
    paragraph/cell is rebuilt by concatenating its runs, falling back to the
    ``.text`` property when the runs yield nothing.

    NOTE(review): tables nested inside table cells are not visited by this
    function -- confirm whether templates can contain them.

    Args:
        file_path: Path of the docx file.

    Returns:
        Sorted list of unique placeholder names, e.g.
        ['field_code1', 'field_code2', ...]. An empty list is returned when
        the file cannot be read (the error and traceback are printed).
    """
    placeholders: Set[str] = set()

    try:
        doc = Document(file_path)

        # Placeholders in body paragraphs
        for paragraph in doc.paragraphs:
            text = ''.join(run.text for run in paragraph.runs) or paragraph.text
            _collect_placeholders(text, placeholders)

        # Placeholders in table cells
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = ''.join(
                        run.text for para in cell.paragraphs for run in para.runs
                    ) or cell.text
                    _collect_placeholders(cell_text, placeholders)

    except Exception as e:
        # Best effort: an unreadable template is reported and treated as
        # having no placeholders so the caller can move on to the next file.
        print(f" 错误: 读取文件失败 - {str(e)}")
        import traceback
        traceback.print_exc()
        return []

    return sorted(placeholders)
|
||
|
||
|
||
def get_field_mapping(conn) -> Dict[str, Dict]:
    """
    Load the tenant's fields and index them by field code.

    ``filed_code`` (sic) is the actual column name in ``f_polic_field``.

    Args:
        conn: Open database connection.

    Returns:
        {filed_code: {'id', 'name', 'field_type', 'state'}}
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute("""
            SELECT id, name, filed_code, field_type, state
            FROM f_polic_field
            WHERE tenant_id = %s
        """, (TENANT_ID,))
        rows = cursor.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cursor.close()

    field_map = {}
    for row in rows:
        state = row['state']
        # state may arrive as bytes; a single byte is decoded to its integer
        # value, any other length is treated as enabled (1).
        if isinstance(state, bytes):
            state = int.from_bytes(state, byteorder='big') if len(state) == 1 else 1

        field_map[row['filed_code']] = {
            'id': row['id'],
            'name': row['name'],
            'field_type': row['field_type'],
            'state': state
        }

    return field_map
|
||
|
||
|
||
def get_template_mapping(conn) -> Dict[str, int]:
    """
    Load the tenant's templates and index them by name.

    Every template is reachable by its stored name. A name that does not end
    in '.docx' is additionally reachable as ``name + '.docx'``, unless another
    template is stored under exactly that name -- an exact name always wins
    over a synthetic alias.

    Args:
        conn: Open database connection.

    Returns:
        {template_name: template_id}
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        cursor.execute("""
            SELECT id, name
            FROM f_polic_file_config
            WHERE tenant_id = %s
        """, (TENANT_ID,))
        rows = cursor.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cursor.close()

    # Pass 1: exact names (a later row with the same name overrides an earlier one).
    template_map = {}
    for row in rows:
        name = row['name']
        if name is None:
            # A template without a name can never match a file name.
            continue
        template_map[name] = row['id']

    # Pass 2: '.docx' aliases, added only where no exact name already claims the key.
    for name, template_id in list(template_map.items()):
        if not name.endswith('.docx'):
            template_map.setdefault(name + '.docx', template_id)

    return template_map
|
||
|
||
|
||
def normalize_template_name(file_name: str) -> str:
    """
    Normalize a template file name for matching against database names.

    Only a trailing '.docx' extension (matched case-insensitively) is removed;
    a '.docx' occurring elsewhere in the name is left untouched.

    Args:
        file_name: File name, with or without the '.docx' extension.

    Returns:
        The file name without its trailing '.docx' extension.
    """
    suffix = '.docx'
    if file_name.lower().endswith(suffix):
        return file_name[:-len(suffix)]
    return file_name
|
||
|
||
|
||
def _relation_state_to_int(state) -> int:
    """Normalize a raw ``state`` column value to an int; unreadable values count as 0."""
    if isinstance(state, bytes):
        return int.from_bytes(state, byteorder='big') if len(state) == 1 else 0
    if state is None:
        return 0
    try:
        return int(state)
    except (TypeError, ValueError, OverflowError):
        return 0


def update_template_field_relations(conn, template_id: int, field_codes: List[str], field_map: Dict[str, Dict], dry_run: bool = True) -> None:
    """
    Synchronize a template's field relations with the given field codes.

    Relations that are active (state=1) but no longer wanted are soft-deleted
    (state=0). Wanted relations that already have a row are re-activated, and
    the remaining ones are inserted. All changes are committed together; on
    any error the transaction is rolled back and the exception re-raised.

    Nothing is changed when none of the codes resolves to an enabled field,
    when the relations are already up to date, or when ``dry_run`` is true.

    Args:
        conn: Open database connection.
        template_id: Template ID.
        field_codes: Field codes (placeholders extracted from the template document).
        field_map: Field mapping as returned by get_field_mapping().
        dry_run: Preview only; print the planned changes without writing.
    """
    cursor = conn.cursor()

    try:
        # Resolve field codes to field IDs, keeping enabled fields (state=1) only.
        field_ids = []
        not_found_codes = []
        for field_code in field_codes:
            field_info = field_map.get(field_code)
            if field_info is None:
                not_found_codes.append(field_code)
            elif field_info['state'] == 1:
                field_ids.append(field_info['id'])

        if not_found_codes:
            print(f" 警告: 以下字段编码在数据库中不存在: {not_found_codes}")

        if not field_ids:
            print(f" 警告: 没有找到有效的字段关联")
            return

        # Fetch every existing relation row of this template, whatever its state.
        cursor.execute("""
            SELECT filed_id, state
            FROM f_polic_file_field
            WHERE tenant_id = %s AND file_id = %s
        """, (TENANT_ID, template_id))
        all_relations = cursor.fetchall()

        # Fields that already own a row (any state) are re-activated rather
        # than inserted again; only state=1 rows count as currently related.
        existing_field_ids = {row[0] for row in all_relations}
        current_field_ids = {
            row[0] for row in all_relations if _relation_state_to_int(row[1]) == 1
        }

        print(f" 当前关联关系数: {len(current_field_ids)} (期望: {len(field_ids)})")

        # Work out which relations to add and which to remove.
        new_field_ids = set(field_ids)
        to_add = new_field_ids - current_field_ids
        to_remove = current_field_ids - new_field_ids

        if not to_add and not to_remove:
            print(f" 无需更新,关联关系已是最新")
            return

        if dry_run:
            print(f" [预览] 将添加 {len(to_add)} 个关联,删除 {len(to_remove)} 个关联")
            if to_add:
                print(f" 添加: {sorted(to_add)[:5]}{'...' if len(to_add) > 5 else ''}")
            if to_remove:
                print(f" 删除: {sorted(to_remove)[:5]}{'...' if len(to_remove) > 5 else ''}")
            return

        # Soft-delete the relations that are no longer wanted (state=0).
        if to_remove:
            # Only '%s' markers are interpolated into the SQL text; the ids
            # themselves are passed as query parameters.
            in_clause = ','.join(['%s'] * len(to_remove))
            cursor.execute(f"""
                UPDATE f_polic_file_field
                SET state = 0, updated_time = %s, updated_by = %s
                WHERE tenant_id = %s AND file_id = %s AND filed_id IN ({in_clause})
            """, [CURRENT_TIME, UPDATED_BY, TENANT_ID, template_id] + list(to_remove))
            print(f" 已删除 {len(to_remove)} 个关联关系")

        # Add the missing relations.
        if to_add:
            added_count = 0
            updated_count = 0
            for field_id in sorted(to_add):
                if field_id in existing_field_ids:
                    # A row exists but is not active: switch it back on.
                    cursor.execute("""
                        UPDATE f_polic_file_field
                        SET state = 1, updated_time = %s, updated_by = %s
                        WHERE tenant_id = %s AND file_id = %s AND filed_id = %s
                    """, (CURRENT_TIME, UPDATED_BY, TENANT_ID, template_id, field_id))
                    updated_count += 1
                else:
                    # No row yet: insert a new active relation.
                    cursor.execute("""
                        INSERT INTO f_polic_file_field
                        (tenant_id, file_id, filed_id, created_time, created_by, updated_time, updated_by, state)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, 1)
                    """, (
                        TENANT_ID, template_id, field_id,
                        CURRENT_TIME, CREATED_BY, CURRENT_TIME, UPDATED_BY
                    ))
                    added_count += 1

            print(f" 已添加 {added_count} 个新关联,更新 {updated_count} 个现有关联")

        conn.commit()
        print(f" [OK] 更新成功: 添加 {len(to_add)} 个,删除 {len(to_remove)} 个")

    except Exception:
        # Undo partial writes, then re-raise with the original traceback.
        conn.rollback()
        raise
    finally:
        cursor.close()
|
||
|
||
|
||
def _split_placeholders_by_type(placeholders, field_map):
    """Split placeholder codes into (input, output, unknown) lists by field_type (1=input, 2=output)."""
    input_fields = []
    output_fields = []
    not_found_fields = []
    for code in placeholders:
        field_info = field_map.get(code)
        if field_info is None:
            not_found_fields.append(code)
        elif field_info['field_type'] == 1:
            input_fields.append(code)
        elif field_info['field_type'] == 2:
            output_fields.append(code)
        # Known fields with any other field_type are ignored.
    return input_fields, output_fields, not_found_fields


def scan_and_update_templates(dry_run: bool = True):
    """
    Scan the template directory, analyze placeholders and update relations.

    For every ``*.docx`` under TEMPLATE_DIR (recursively, skipping Office
    ``~$`` lock files) the matching template row is looked up by file name,
    the placeholders are extracted and classified as input/output fields, the
    two required input fields are added, and the template's relations are
    synchronized. A summary is printed at the end.

    Args:
        dry_run: Preview only; print the planned changes without writing.
    """
    print("=" * 80)
    print("扫描模板文档并更新关联关系")
    print("=" * 80)
    print(f"模板目录: {TEMPLATE_DIR}")
    print(f"租户ID: {TENANT_ID}")
    print(f"模式: {'预览模式(不会实际更新数据库)' if dry_run else '更新模式(会更新数据库)'}")
    print()

    # Every template must be related to these two input fields:
    # clue_info (clue information) and target_basic_info_clue (clue about the
    # basic work situation of the person under review).
    required_input_fields = ['clue_info', 'target_basic_info_clue']

    # Connect to the database
    conn = pymysql.connect(**DB_CONFIG)

    try:
        # Load the field and template mappings
        print("加载数据库数据...")
        field_map = get_field_mapping(conn)
        template_map = get_template_mapping(conn)
        print(f" 字段总数: {len(field_map)}")
        print(f" 模板总数: {len(template_map)}")
        print()

        # Scan the template directory
        template_path = Path(TEMPLATE_DIR)
        if not template_path.exists():
            print(f"错误: 模板目录不存在: {TEMPLATE_DIR}")
            return

        # Skip the "~$" lock files Office creates while a document is open.
        docx_files = [
            f for f in template_path.rglob("*.docx") if not f.name.startswith("~$")
        ]

        print(f"找到 {len(docx_files)} 个模板文件")
        print()

        # Statistics
        updated_count = 0
        not_found_count = 0
        error_count = 0

        # Process every template file
        for processed_count, docx_file in enumerate(sorted(docx_files), start=1):
            relative_path = docx_file.relative_to(template_path)
            template_name = normalize_template_name(docx_file.name)

            print(f"[{processed_count}/{len(docx_files)}] {relative_path}")
            print(f" 模板名称: {template_name}")

            # Look up the template ID: extension-less name first, then the raw file name.
            for candidate in (template_name, docx_file.name):
                if candidate in template_map:
                    template_id = template_map[candidate]
                    break
            else:
                print(f" [ERROR] 未找到对应的模板配置")
                not_found_count += 1
                print()
                continue

            print(f" 模板ID: {template_id}")

            try:
                # Extract the placeholders
                placeholders = extract_placeholders_from_docx(str(docx_file))
                print(f" 占位符数量: {len(placeholders)}")

                # NOTE(review): a template with no placeholders (or one that
                # could not be read) is skipped entirely, so it does not get
                # the required input fields either -- confirm this is intended.
                if not placeholders:
                    print(f" [WARN] 未找到占位符")
                    print()
                    continue

                # Classify placeholders into input and output fields
                input_fields, output_fields, not_found_fields = (
                    _split_placeholders_by_type(placeholders, field_map)
                )

                # Add the required input fields even when the document does
                # not reference them; only enabled fields are added.
                for req_field in required_input_fields:
                    field_info = field_map.get(req_field)
                    if (field_info is not None and field_info['state'] == 1
                            and req_field not in input_fields):
                        input_fields.append(req_field)

                print(f" 输入字段: {len(input_fields)} (包含必需字段), 输出字段: {len(output_fields)}")
                if input_fields:
                    print(f" 输入字段编码: {input_fields}")
                if output_fields:
                    print(f" 输出字段编码: {output_fields[:10]}{'...' if len(output_fields) > 10 else ''}")
                if not_found_fields:
                    print(f" 未找到的字段编码: {not_found_fields[:5]}{'...' if len(not_found_fields) > 5 else ''}")

                # Merge all field codes (input fields + output fields)
                all_field_codes = input_fields + output_fields

                # Update the relations
                print(f" 更新关联关系...")
                update_template_field_relations(
                    conn, template_id, all_field_codes, field_map, dry_run=dry_run
                )

                updated_count += 1
                print()

            except Exception as e:
                # One failing template must not abort the whole run.
                print(f" [ERROR] 处理失败: {str(e)}")
                import traceback
                traceback.print_exc()
                error_count += 1
                print()

        # Print the summary
        print("=" * 80)
        print("处理完成")
        print("=" * 80)
        print(f"总文件数: {len(docx_files)}")
        print(f"处理成功: {updated_count}")
        print(f"未找到模板: {not_found_count}")
        print(f"处理失败: {error_count}")

        if dry_run:
            print()
            print("注意: 这是预览模式,未实际更新数据库")
            print("要实际更新,请运行: python update_template_field_relations_from_docx.py --update")

    finally:
        conn.close()
|
||
|
||
|
||
if __name__ == '__main__':
    # Preview mode is the default; the database is only modified when the
    # script is started with the --update argument.
    dry_run = '--update' not in sys.argv

    scan_and_update_templates(dry_run=dry_run)
|