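# File listing metadata: 361 lines, 15 KiB, Python
"""
Test template placeholder detection and replacement.

Reads a template file, identifies its placeholders, replaces them with
mock data and generates a new document.
"""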
|
||
import os
|
||
import re
|
||
from pathlib import Path
|
||
from docx import Document
|
||
from services.document_service import DocumentService
|
||
|
||
|
||
def extract_placeholders_from_docx(file_path: str) -> set:
    """
    Collect every ``{{field_code}}`` placeholder name used in a .docx file.

    Body paragraphs and the paragraphs inside every cell of the tables in
    ``doc.tables`` are scanned one paragraph at a time, so text from two
    different paragraphs is never glued together into a bogus placeholder.

    Args:
        file_path: Path of the .docx file to scan.

    Returns:
        Set of normalized placeholder names, e.g.
        ``{'field_code1', 'field_code2', ...}``. If reading the document
        raises, the error and its traceback are printed and whatever was
        collected so far (possibly an empty set) is returned.
    """
    placeholders = set()

    try:
        doc = Document(file_path)

        # Placeholders in the document body.
        for paragraph in doc.paragraphs:
            placeholders |= _scan_text_for_placeholders(_joined_run_text(paragraph))

        # Placeholders inside table cells.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        placeholders |= _scan_text_for_placeholders(
                            _joined_run_text(paragraph)
                        )

    except Exception as e:
        # Deliberately best-effort: report the problem and return what we have.
        print(f" [错误] 读取文件失败 - {e}")
        import traceback
        traceback.print_exc()

    return placeholders


# Matches the {{field_code}} form; compiled once instead of on every call.
_EXTRACT_PLACEHOLDER_RE = re.compile(r'\{\{([^}]+)\}\}')


def _joined_run_text(paragraph) -> str:
    """Return the concatenated run text, falling back to ``paragraph.text`` when empty."""
    text = ''.join(run.text for run in paragraph.runs)
    return text or paragraph.text


def _scan_text_for_placeholders(text: str) -> set:
    """Return the normalized placeholder names found in ``text``."""
    found = set()
    for raw in _EXTRACT_PLACEHOLDER_RE.findall(text):
        # Trim the name, then turn inner whitespace runs (spaces, tabs,
        # newlines) into a single underscore.
        name = re.sub(r'\s+', '_', raw.strip())
        # Drop incomplete placeholders whose captured body still holds a brace.
        if name and '{' not in name and '}' not in name:
            found.add(name)
    return found
|
||
|
||
|
||
def generate_mock_data(placeholders: set) -> dict:
    """
    Produce a mock value for each placeholder name.

    Args:
        placeholders: Placeholder names (field codes) that need a value.

    Returns:
        Dict keyed by placeholder name. Names present in the built-in
        sample table get their canned value; any other name gets the
        fallback text ``[虚拟数据-<name>]``.
    """
    # Basic personal information.
    basic_info = {
        'target_name': '张三',
        'target_organization_and_position': '某市某区某局副局长',
        'target_gender': '男',
        'target_date_of_birth': '198005',
        'target_date_of_birth_full': '1980年05月15日',
        'target_political_status': '中共党员',
        'target_professional_rank': '副处级',
        'target_id_number': '110101198005151234',
        'target_address': '某市某区某街道某小区1号楼101室',
        'target_registered_address': '某市某区某街道某小区1号楼101室',
        'target_contact': '13800138000',
        'target_place_of_origin': '某省某市',
        'target_ethnicity': '汉族',
    }

    # Problem clues.
    clue_info = {
        'clue_source': '群众举报',
        'target_issue_description': '涉嫌违反工作纪律,在项目审批过程中存在不当行为',
        'target_problem_description': '在项目审批过程中,未严格按照规定程序执行,存在程序不规范问题',
        'target_work_basic_info': '2005年参加工作,先后在某局多个科室工作,2018年任现职',
        'target_attitude': '被核查人表示认识到问题的严重性,愿意积极配合调查',
    }

    # Approval information.
    approval_info = {
        'filler_name': '李四',
        'department_opinion': '同意开展初步核实',
        'approval_time': '2024年12月15日',
        'report_card_request_time': '2024年12月15日',
        'handling_department': '某市纪委监委第一监督检查室',
        'handler_name': '王五',
    }

    # Interview related fields.
    interview_info = {
        'appointment_time': '2024年12月20日上午9:00',
        'appointment_location': '某市纪委监委谈话室',
        'notification_time': '2024年12月18日',
        'notification_location': '某市纪委监委',
    }

    # Investigation information.
    investigation_info = {
        'investigation_unit_name': '某市纪委监委',
        'investigation_team_leader_name': '赵六',
        'investigation_team_member_names': '赵六、孙七、周八',
        'investigation_location': '某市纪委监委',
        'investigation_team_code': '2024-001',
    }

    # Everything else.
    other_info = {
        'commission_name': '某市纪律检查委员会',
        'backup_personnel': '钱九',
        'assessment_opinion': '风险评估为低风险,可以开展谈话',
    }

    known_values = {
        **basic_info,
        **clue_info,
        **approval_info,
        **interview_info,
        **investigation_info,
        **other_info,
    }

    # Known names use the sample table; unknown names get a generated default.
    return {
        code: known_values.get(code, f'[虚拟数据-{code}]')
        for code in placeholders
    }
|
||
|
||
|
||
def test_template_replacement(template_path: str, output_dir: str):
    """
    Detect the placeholders of a template, fill them with mock data and verify.

    Steps: extract the placeholder names, generate mock values, replace the
    placeholders in body paragraphs and table-cell paragraphs, save the result
    as ``<template stem>_已填充.docx`` inside ``output_dir``, then re-open the
    saved file and report any placeholder that is still present.

    Args:
        template_path: Path of the .docx template.
        output_dir: Directory for the filled document (created if missing).

    Returns:
        True when the filled document was saved and re-read. Leftover
        placeholders only produce a warning and do not change the result.
        False when the template does not exist, contains no placeholders,
        or filling/saving/verifying raised an exception.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"测试模板: {template_path}")
    print(rule)

    # Guard: the template must exist.
    if not os.path.exists(template_path):
        print(f" [错误] 模板文件不存在: {template_path}")
        return False

    # Step 1: extract placeholders.
    print("\n[步骤1] 提取占位符...")
    placeholders = extract_placeholders_from_docx(template_path)
    print(f" 发现 {len(placeholders)} 个不同的占位符:")
    for placeholder in sorted(placeholders):
        print(f" - {{{{ {placeholder} }}}}")

    if not placeholders:
        print(" [警告] 未发现任何占位符,模板可能不需要填充数据")
        return False

    # Step 2: generate mock data.
    print("\n[步骤2] 生成虚拟数据...")
    field_data = generate_mock_data(placeholders)
    print(f" 生成了 {len(field_data)} 个字段的虚拟数据")

    os.makedirs(output_dir, exist_ok=True)

    # Step 3: fill the template. DocumentService needs environment
    # configuration, so this test uses its own simplified fill logic.
    print("\n[步骤3] 填充模板...")
    try:
        doc = Document(template_path)
        for paragraph in _iter_template_paragraphs(doc):
            _fill_paragraph(paragraph, field_data)

        template_name = Path(template_path).stem
        output_file = os.path.join(output_dir, f"{template_name}_已填充.docx")
        doc.save(output_file)
        print(f" [成功] 文档已保存到: {output_file}")

        # Step 4: re-open the saved file and look for leftovers.
        print("\n[步骤4] 验证替换结果...")
        placeholder_pattern = re.compile(r'\{\{([^}]+)\}\}')
        remaining_placeholders = set()
        for paragraph in _iter_template_paragraphs(Document(output_file)):
            for raw in placeholder_pattern.findall(paragraph.text):
                cleaned = raw.strip()
                if cleaned and '{' not in cleaned and '}' not in cleaned:
                    remaining_placeholders.add(cleaned)

        if remaining_placeholders:
            print(f" [警告] 仍有 {len(remaining_placeholders)} 个占位符未替换:")
            for placeholder in sorted(remaining_placeholders):
                print(f" - {{{{ {placeholder} }}}}")
        else:
            print(" [成功] 所有占位符已成功替换!")

        # Leftover placeholders are reported above but still count as success.
        return True

    except Exception as e:
        print(f" [错误] 填充模板失败: {e}")
        import traceback
        traceback.print_exc()
        return False


# Placeholder body excludes both braces so "{{{name}}" still yields "{{name}}".
_FILLABLE_PLACEHOLDER_RE = re.compile(r'\{\{([^{}]+)\}\}')


def _iter_template_paragraphs(doc):
    """Yield the body paragraphs, then the paragraphs of every table cell."""
    yield from doc.paragraphs
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from cell.paragraphs


def _fill_text(text: str, field_data: dict) -> str:
    """Return ``text`` with every known ``{{ field code }}`` replaced by its value."""

    def substitute(match):
        raw = match.group(1)
        # Try the name exactly as written, then the normalized form used by
        # the extractor (trimmed, inner whitespace runs -> one underscore).
        for key in (raw, re.sub(r'\s+', '_', raw.strip())):
            if key in field_data:
                value = field_data[key]
                return '' if value is None else str(value)
        # Unknown placeholder: leave it untouched so verification reports it.
        return match.group(0)

    # A callable replacement inserts the value literally, so backslashes or
    # group references inside a value are never interpreted by ``re``.
    return _FILLABLE_PLACEHOLDER_RE.sub(substitute, text)


def _fill_paragraph(paragraph, field_data: dict) -> None:
    """Replace placeholders in one paragraph, writing the result into its first run."""
    original_text = paragraph.text
    if not original_text:
        return

    filled_text = _fill_text(original_text, field_data)
    if filled_text == original_text:
        return

    runs = paragraph.runs
    if not runs:
        return

    # The whole text goes into the first run and the other runs are blanked,
    # so the paragraph ends up with the first run's formatting.
    runs[0].text = filled_text
    for run in runs[1:]:
        run.text = ''
|
||
|
||
|
||
def main():
    """Run the placeholder detection/replacement check on one fixed template and print a summary."""
    rule = "=" * 80

    # Everything is resolved relative to the directory holding this script.
    base_dir = Path(__file__).parent

    # Template under test.
    template_file = base_dir.joinpath(
        "template_finish",
        "2-初核模版",
        "2.谈话审批",
        "走读式谈话审批",
        "2谈话审批表-重新制作表格.docx",
    )

    # Where the filled document is written.
    output_dir = base_dir / "output_temp"

    print(rule)
    print("模板占位符识别和替换测试")
    print(rule)

    # Run the check.
    success = test_template_replacement(str(template_file), str(output_dir))
    status = '[成功]' if success else '[失败]'

    # Summary.
    print(f"\n{rule}")
    print("测试总结")
    print(rule)
    print(f"模板 (2谈话审批表-重新制作表格.docx): {status}")
    print(f"\n输出目录: {output_dir}")
    print(f"{rule}\n")


# Execute only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|