ai-business-write/test_template_placeholder_replacement.py

361 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
测试模板占位符识别和替换功能
读取模板文件,识别占位符,用虚拟数据替换并生成新文档
"""
import os
import re
from pathlib import Path
from docx import Document
from services.document_service import DocumentService
# Matches {{field_code}}. Braces are excluded from the captured name so that a
# stray extra brace (e.g. "{{{code}}}") still yields the inner placeholder
# instead of a brace-polluted name that would have to be thrown away.
_PLACEHOLDER_PATTERN = re.compile(r'\{\{([^{}]+)\}\}')


def _placeholders_in_text(text: str) -> set:
    """Return the normalized placeholder names found in ``text``."""
    found = set()
    for raw_name in _PLACEHOLDER_PATTERN.findall(text):
        # Trim the name and collapse inner whitespace (spaces, tabs, line
        # breaks) into single underscores.
        name = re.sub(r'\s+', '_', raw_name.strip())
        if name:
            found.add(name)
    return found


def _iter_docx_paragraphs(container):
    """Yield the container's own paragraphs, then those of its table cells.

    ``container`` is anything exposing ``paragraphs`` and ``tables`` (the
    document itself or a table cell), so tables nested inside cells are
    visited as well.
    """
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from _iter_docx_paragraphs(cell)


def extract_placeholders_from_docx(file_path: str) -> set:
    """Collect every ``{{field_code}}`` placeholder used in a docx file.

    Each paragraph (body paragraphs first, then table-cell paragraphs) is
    scanned on its own, so a "{{" in one paragraph can never pair up with a
    "}}" from the next one.

    Args:
        file_path: Path of the docx file to scan.

    Returns:
        Set of placeholder names, e.g. ``{'field_code1', 'field_code2'}``.
        Names are stripped and inner whitespace is replaced by "_".
        If the file cannot be read, the error is printed and whatever was
        collected so far (possibly an empty set) is returned.
    """
    placeholders = set()
    try:
        doc = Document(file_path)
        for paragraph in _iter_docx_paragraphs(doc):
            placeholders |= _placeholders_in_text(paragraph.text)
    except Exception as e:
        # Best-effort by design: report the failure and return what we have.
        print(f" [错误] 读取文件失败 - {str(e)}")
        import traceback
        traceback.print_exc()
    return placeholders
def generate_mock_data(placeholders: set) -> dict:
    """Build mock field values for the given placeholder names.

    Args:
        placeholders: Placeholder names to produce data for. Any iterable of
            names is accepted; ``None`` is treated as "no placeholders".

    Returns:
        Dict mapping each placeholder name to its mock value. Names without
        a predefined value get a visible default of the form
        ``[虚拟数据-<name>]``.
    """
    # Predefined mock values for the commonly used fields.
    mock_data_map = {
        # Basic information
        'target_name': '张三',
        'target_organization_and_position': '某市某区某局副局长',
        'target_gender': '',
        'target_date_of_birth': '198005',
        'target_date_of_birth_full': '1980年05月15日',
        'target_political_status': '中共党员',
        'target_professional_rank': '副处级',
        'target_id_number': '110101198005151234',
        'target_address': '某市某区某街道某小区1号楼101室',
        'target_registered_address': '某市某区某街道某小区1号楼101室',
        'target_contact': '13800138000',
        'target_place_of_origin': '某省某市',
        'target_ethnicity': '汉族',
        # Issue clues
        'clue_source': '群众举报',
        'target_issue_description': '涉嫌违反工作纪律,在项目审批过程中存在不当行为',
        'target_problem_description': '在项目审批过程中,未严格按照规定程序执行,存在程序不规范问题',
        'target_work_basic_info': '2005年参加工作先后在某局多个科室工作2018年任现职',
        'target_attitude': '被核查人表示认识到问题的严重性,愿意积极配合调查',
        # Approval information
        'filler_name': '李四',
        'department_opinion': '同意开展初步核实',
        'approval_time': '2024年12月15日',
        'report_card_request_time': '2024年12月15日',
        'handling_department': '某市纪委监委第一监督检查室',
        'handler_name': '王五',
        # Interview related
        'appointment_time': '2024年12月20日上午9:00',
        'appointment_location': '某市纪委监委谈话室',
        'notification_time': '2024年12月18日',
        'notification_location': '某市纪委监委',
        # Investigation information
        'investigation_unit_name': '某市纪委监委',
        'investigation_team_leader_name': '赵六',
        'investigation_team_member_names': '赵六、孙七、周八',
        'investigation_location': '某市纪委监委',
        'investigation_team_code': '2024-001',
        # Other
        'commission_name': '某市纪律检查委员会',
        'backup_personnel': '钱九',
        'assessment_opinion': '风险评估为低风险,可以开展谈话',
    }
    # Known names take the predefined value; anything else gets a default
    # that embeds the name so it is easy to spot in the generated document.
    return {
        placeholder: mock_data_map.get(placeholder, f'[虚拟数据-{placeholder}]')
        for placeholder in (placeholders or ())
    }
# Matches {{ field_code }}. Braces are excluded from the captured name so an
# extra stray brace around a placeholder does not hide the placeholder itself.
_TEMPLATE_PLACEHOLDER_RE = re.compile(r'\{\{([^{}]+)\}\}')


def _normalize_field_code(raw_name: str) -> str:
    """Trim a raw placeholder name and turn inner whitespace runs into "_"."""
    return re.sub(r'\s+', '_', raw_name.strip())


def _iter_template_paragraphs(container):
    """Yield the container's own paragraphs, then those of its table cells.

    ``container`` is anything exposing ``paragraphs`` and ``tables`` (the
    document or a table cell), so tables nested inside cells are covered.
    """
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from _iter_template_paragraphs(cell)


def _fill_placeholders(text: str, field_data: dict) -> str:
    """Return ``text`` with every known placeholder replaced by its value.

    A placeholder is matched regardless of the whitespace inside the braces:
    ``{{code}}``, ``{{ code }}`` and ``{{ field code }}`` (for the key
    ``field_code``) are all recognized.
    """

    def substitute(match):
        field_code = _normalize_field_code(match.group(1))
        if field_code not in field_data:
            # Unknown placeholder: keep it so the verification step reports it.
            return match.group(0)
        field_value = field_data[field_code]
        return str(field_value) if field_value else ''

    # A callable replacement is inserted literally, so a backslash or "\1"
    # inside a value cannot be misread as a regex escape or group reference.
    return _TEMPLATE_PLACEHOLDER_RE.sub(substitute, text)


def _find_remaining_placeholders(doc) -> set:
    """Return the (stripped) names of placeholders still present in ``doc``."""
    remaining = set()
    for paragraph in _iter_template_paragraphs(doc):
        for raw_name in _TEMPLATE_PLACEHOLDER_RE.findall(paragraph.text):
            name = raw_name.strip()
            if name:
                remaining.add(name)
    return remaining


def test_template_replacement(template_path: str, output_dir: str):
    """Detect the placeholders of a docx template, fill them with mock data
    and save the result.

    Steps: extract placeholders, generate mock data, replace placeholders in
    body paragraphs and table cells, save ``<template stem>_已填充.docx`` into
    ``output_dir``, then re-open the saved file and report placeholders that
    are still present.

    Args:
        template_path: Path of the docx template.
        output_dir: Directory for the filled document (created if missing).

    Returns:
        ``True`` when the filled document was written (left-over
        placeholders are only reported as a warning); ``False`` when the
        template is missing, contains no placeholders, or filling fails.
    """
    separator = '=' * 80
    print(f"\n{separator}")
    print(f"测试模板: {template_path}")
    print(f"{separator}")

    # Guard: the template must exist.
    if not os.path.exists(template_path):
        print(f" [错误] 模板文件不存在: {template_path}")
        return False

    # Step 1: extract placeholders.
    print(f"\n[步骤1] 提取占位符...")
    placeholders = extract_placeholders_from_docx(template_path)
    print(f" 发现 {len(placeholders)} 个不同的占位符:")
    for placeholder in sorted(placeholders):
        print(f" - {{{{ {placeholder} }}}}")
    if not placeholders:
        print(f" [警告] 未发现任何占位符,模板可能不需要填充数据")
        return False

    # Step 2: generate mock data.
    print(f"\n[步骤2] 生成虚拟数据...")
    field_data = generate_mock_data(placeholders)
    print(f" 生成了 {len(field_data)} 个字段的虚拟数据")

    os.makedirs(output_dir, exist_ok=True)

    # Step 3: fill the template.
    print(f"\n[步骤3] 填充模板...")
    try:
        # DocumentService needs environment-variable configuration, so this
        # test performs a simplified in-place fill with python-docx instead.
        doc = Document(template_path)
        for paragraph in _iter_template_paragraphs(doc):
            original_text = paragraph.text
            if not original_text:
                continue
            filled_text = _fill_placeholders(original_text, field_data)
            if filled_text == original_text:
                # Nothing replaced: leave the paragraph's runs untouched.
                continue
            runs = paragraph.runs
            if not runs:
                # No run to write into; the verification step will report it.
                continue
            # The whole text goes into the first run and the others are
            # emptied, so only the first run's formatting is kept.
            runs[0].text = filled_text
            for run in runs[1:]:
                run.text = ''

        template_name = Path(template_path).stem
        output_file = os.path.join(output_dir, f"{template_name}_已填充.docx")
        doc.save(output_file)
        print(f" [成功] 文档已保存到: {output_file}")

        # Step 4: re-open the saved file and look for left-over placeholders.
        print(f"\n[步骤4] 验证替换结果...")
        remaining_placeholders = _find_remaining_placeholders(Document(output_file))
        if remaining_placeholders:
            print(f" [警告] 仍有 {len(remaining_placeholders)} 个占位符未替换:")
            for placeholder in sorted(remaining_placeholders):
                print(f" - {{{{ {placeholder} }}}}")
        else:
            print(f" [成功] 所有占位符已成功替换!")
        return True
    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it.
        print(f" [错误] 填充模板失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


# The name starts with "test_" but the function takes required arguments;
# opt out of pytest collection so it is not run as a test with missing fixtures.
test_template_replacement.__test__ = False
def main():
    """Run the placeholder replacement check on the fixed template and print a summary.

    The template and the output directory are both resolved relative to the
    directory containing this file.
    """
    base_dir = Path(__file__).parent
    template_file = base_dir.joinpath(
        "template_finish",
        "2-初核模版",
        "2.谈话审批",
        "走读式谈话审批",
        "2谈话审批表-重新制作表格.docx",
    )
    results_dir = base_dir / "output_temp"

    banner = "=" * 80

    # Header
    print(banner)
    print("模板占位符识别和替换测试")
    print(banner)

    # Run the check against the template.
    succeeded = test_template_replacement(str(template_file), str(results_dir))

    # Summary
    status_label = '[成功]' if succeeded else '[失败]'
    print(f"\n{banner}")
    print("测试总结")
    print(banner)
    print(f"模板 (2谈话审批表-重新制作表格.docx): {status_label}")
    print(f"\n输出目录: {results_dir}")
    print(f"{banner}\n")


# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    main()