# ai-business-write/test_template_reading_and_placeholder.py
# 2025-12-26 09:16:31 +08:00
#
# 318 lines
# 12 KiB
# Python
# Raw Permalink Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
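Test template reading and placeholder recognition directly (no API service required).
1. Check that every template file can be read correctly.
2. Verify placeholder recognition.
3. Test placeholder replacement (using DocumentService).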
"""
import os
import sys
from pathlib import Path
from typing import Dict, List, Set
from dotenv import load_dotenv
import re
from docx import Document
# Add the project root (the directory containing this file) to sys.path;
# this has to happen before the project-local `services` import below.
PROJECT_ROOT = Path(__file__).parent
sys.path.insert(0, str(PROJECT_ROOT))
from services.document_service import DocumentService
# Load environment variables.
load_dotenv()
# Directory that is scanned for .docx templates by the test below.
TEMPLATES_DIR = PROJECT_ROOT / "template_finish"
def print_section(title):
    """Print *title* as a section banner.

    The banner is a blank line, a rule of 70 ``=`` characters, the title
    (prefixed with one space) and a second rule.
    """
    rule = "=" * 70
    print("\n" + rule, f" {title}", rule, sep="\n")
def print_result(success, message):
    """Print *message* prefixed with ``[OK]`` when *success* is truthy, else ``[FAIL]``."""
    if success:
        tag = "[OK]"
    else:
        tag = "[FAIL]"
    print("{} {}".format(tag, message))
def scan_local_templates(base_dir: Path) -> Dict[str, Path]:
    """Recursively collect the ``.docx`` files found under *base_dir*.

    Args:
        base_dir: Directory to walk; it may be missing.

    Returns:
        A dict mapping each file's path relative to ``PROJECT_ROOT``
        (backslashes replaced by ``/``) to its ``Path``. The dict is empty
        when *base_dir* does not exist or holds no ``.docx`` file.
    """
    found: Dict[str, Path] = {}
    if base_dir.exists():
        for candidate in base_dir.rglob('*'):
            if not candidate.is_file():
                continue
            # Case-insensitive match on the extension.
            if candidate.suffix.lower() != '.docx':
                continue
            key = str(candidate.relative_to(PROJECT_ROOT)).replace('\\', '/')
            found[key] = candidate
    return found
def _harvest_placeholders(paragraphs, pattern, sink) -> None:
    """Add every non-empty, stripped match of *pattern* in *paragraphs* to *sink*.

    The set is filled in place so that matches collected before an error are
    kept by the caller.
    """
    for paragraph in paragraphs:
        stripped = map(str.strip, pattern.findall(paragraph.text))
        sink.update(code for code in stripped if code)


def extract_placeholders_from_docx(file_path: Path) -> tuple[Set[str], bool, str]:
    """Extract all ``{{field}}`` placeholders from a .docx file.

    Both the document's paragraphs and the paragraphs inside its table cells
    are scanned. A table that raises while being walked is skipped.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        ``(placeholders, success, error)``. On success *error* is ``""``.
        If anything outside the per-table guard raises, *success* is ``False``,
        *error* is the exception text and *placeholders* holds whatever had
        been collected up to that point.
    """
    found: Set[str] = set()
    pattern = re.compile(r'\{\{([^}]+)\}\}')
    try:
        doc = Document(file_path)
        # Paragraphs of the document itself.
        _harvest_placeholders(doc.paragraphs, pattern, found)
        # Paragraphs inside table cells.
        for table in doc.tables:
            try:
                for row in table.rows:
                    for cell in row.cells:
                        _harvest_placeholders(cell.paragraphs, pattern, found)
            except Exception:
                # Some table structures can raise while being walked; skip the rest of that table.
                continue
    except Exception as exc:
        return found, False, str(exc)
    return found, True, ""
def _remove_quietly(path) -> None:
    """Best-effort deletion of a temporary file.

    A falsy *path* is ignored, and so is any ``OSError`` (file already gone,
    locked, permission denied): cleanup must never change the reported outcome.
    """
    if not path:
        return
    try:
        Path(path).unlink()
    except OSError:
        pass


def _survey_templates(local_templates):
    """Extract the placeholders of every template, printing progress every 10 files.

    Args:
        local_templates: Mapping of relative path -> ``Path`` as returned by
            ``scan_local_templates``.

    Returns:
        A list with one dict per template, with the keys ``path``, ``name``,
        ``read_success``, ``placeholders`` and ``error``.
    """
    template_results = []
    for i, (rel_path, file_path) in enumerate(local_templates.items(), 1):
        placeholders, success, error = extract_placeholders_from_docx(file_path)
        template_results.append({
            'path': rel_path,
            'name': file_path.name,
            'read_success': success,
            'placeholders': placeholders,
            'error': error,
        })
        # Show progress once every 10 templates.
        if i % 10 == 0:
            print(f" 已处理: {i}/{len(local_templates)}")
    return template_results


def _report_statistics(template_results) -> None:
    """Print the read statistics (step 3) and the sorted placeholder list (step 4)."""
    readable = [r for r in template_results if r['read_success']]
    failed = [r for r in template_results if not r['read_success']]
    with_placeholders_count = sum(1 for r in readable if r['placeholders'])
    # Only templates that were read successfully contribute to the global set.
    all_placeholders = set()
    for result in readable:
        all_placeholders.update(result['placeholders'])

    print_section("3. 测试结果统计")
    print(f" 总模板数: {len(template_results)}")
    print(f" 读取成功: {len(readable)}")
    print(f" 读取失败: {len(failed)}")
    print(f" 有占位符: {with_placeholders_count}")
    print(f" 无占位符: {len(readable) - with_placeholders_count}")
    print(f" 发现的占位符总数: {len(all_placeholders)} 个不同的占位符")
    if failed:
        print("\n 读取失败的模板:")
        for result in failed:
            print(f" - {result['name']}: {result['error']}")

    print_section("4. 所有占位符列表")
    if all_placeholders:
        for placeholder in sorted(all_placeholders):
            print(f" - {placeholder}")
    else:
        print(" 未发现占位符")


def _check_template_reading(template_results) -> None:
    """Step 5: fetch up to three placeholder-bearing templates through DocumentService.

    Any exception escaping the step (including a failing ``DocumentService()``
    constructor) is reported with a traceback instead of being raised.
    """
    import traceback

    print_section("5. 测试DocumentService模板读取功能")
    try:
        document_service = DocumentService()
        print_result(True, "DocumentService初始化成功")
        # Pick a few templates that actually contain placeholders.
        test_templates = [r for r in template_results if r['read_success'] and r['placeholders']][:3]
        if not test_templates:
            print_result(False, "没有找到有占位符的模板进行测试")
            return
        print(f"\n 测试 {len(test_templates)} 个模板的读取功能:")
        for template_result in test_templates:
            rel_path = template_result['path']
            placeholders = template_result['placeholders']
            print(f"\n 模板: {template_result['name']}")
            print(f" 路径: {rel_path}")
            print(f" 占位符: {sorted(placeholders)}")
            temp_path = None
            try:
                # NOTE(review): the original comment says download_template_from_minio
                # "now reads from local"; confirm it returns a temporary copy,
                # because the returned path is deleted below.
                temp_path = document_service.download_template_from_minio(rel_path)
                if temp_path and Path(temp_path).exists():
                    print_result(True, f"模板读取成功: {temp_path}")
                    # Verify that the fetched file opens as a document.
                    try:
                        doc = Document(temp_path)
                        print(f" 文档段落数: {len(doc.paragraphs)}")
                        print(f" 文档表格数: {len(doc.tables)}")
                    except Exception as e:
                        print_result(False, f"验证文档内容失败: {str(e)}")
                else:
                    print_result(False, "模板读取失败:文件不存在")
            except Exception as e:
                print_result(False, f"模板读取失败: {str(e)}")
            finally:
                # Clean up even when verification failed, so no temp file is leaked.
                _remove_quietly(temp_path)
    except Exception as e:
        print_result(False, f"DocumentService初始化失败: {str(e)}")
        traceback.print_exc()


def _check_placeholder_replacement(template_results) -> None:
    """Step 6: fill the first placeholder-bearing template and look for leftovers.

    The generated document is re-scanned with ``extract_placeholders_from_docx``
    so that placeholders left in tables are detected as well as those in
    body paragraphs.
    """
    import traceback

    print_section("6. 测试占位符替换功能")
    try:
        document_service = DocumentService()
        # Pick the first template that contains placeholders.
        test_template = next(
            (r for r in template_results if r['read_success'] and r['placeholders']),
            None,
        )
        if test_template is None:
            print_result(False, "没有找到有占位符的模板进行测试")
            return
        rel_path = test_template['path']
        placeholders = test_template['placeholders']
        print(f" 测试模板: {test_template['name']}")
        print(f" 路径: {rel_path}")
        print(f" 占位符: {sorted(placeholders)}")
        # Test data: known fields get a readable value, others a generated one.
        test_values = {
            'target_name': '测试姓名',
            'target_organization': '测试单位',
            'target_position': '测试职务',
            'target_organization_and_position': '测试单位-测试职务',
            'investigation_team_code': 'DC2025001',
            'appointment_time': '2025-12-16 14:00',
            'appointment_location': '会议室A',
        }
        field_data = {
            placeholder: test_values.get(placeholder, f'测试值_{placeholder}')
            for placeholder in placeholders
        }
        print(f" 测试数据: {field_data}")

        template_path = None
        filled_doc_path = None
        try:
            # Fetch the template.
            template_path = document_service.download_template_from_minio(rel_path)
            if not (template_path and Path(template_path).exists()):
                print_result(False, "模板读取失败:文件不存在")
                return
            # Fill the template.
            filled_doc_path = document_service.fill_template(template_path, field_data)
            if not (filled_doc_path and Path(filled_doc_path).exists()):
                print_result(False, "文档生成失败:文件不存在")
                return
            print_result(True, f"文档生成成功: {filled_doc_path}")
            # Re-scan the generated document (paragraphs and tables) for leftovers.
            remaining_placeholders, readable, error = extract_placeholders_from_docx(
                Path(filled_doc_path)
            )
            if not readable:
                print_result(False, f"验证生成的文档失败: {error}")
            elif remaining_placeholders:
                print_result(False, f"仍有未替换的占位符: {sorted(remaining_placeholders)}")
            else:
                print_result(True, "所有占位符已成功替换")
        except Exception as e:
            print_result(False, f"占位符替换测试失败: {str(e)}")
            traceback.print_exc()
        finally:
            # Each file is removed independently; runs on every exit path.
            _remove_quietly(template_path)
            _remove_quietly(filled_doc_path)
    except Exception as e:
        print_result(False, f"占位符替换测试初始化失败: {str(e)}")
        traceback.print_exc()


def test_template_reading_and_placeholders():
    """Run the template-reading and placeholder checks and print a report.

    Steps: scan ``TEMPLATES_DIR`` for .docx templates, extract the placeholders
    of each one, print statistics and the placeholder list, then exercise
    ``DocumentService`` template reading and placeholder replacement. Results
    are printed rather than asserted; the function returns ``None`` and stops
    early when no template is found.
    """
    print_section("测试模板读取和占位符识别")
    # 1. Scan the templates.
    print_section("1. 扫描本地模板文件")
    local_templates = scan_local_templates(TEMPLATES_DIR)
    print_result(True, f"找到 {len(local_templates)} 个.docx模板文件")
    if not local_templates:
        print_result(False, "未找到模板文件")
        return
    # 2. Read every template.
    print_section("2. 测试模板读取和占位符识别")
    template_results = _survey_templates(local_templates)
    # 3 + 4. Statistics and placeholder list.
    _report_statistics(template_results)
    # 5. DocumentService template reading.
    _check_template_reading(template_results)
    # 6. Placeholder replacement.
    _check_placeholder_replacement(template_results)
    print_section("测试完成")


# Script entry point: run the whole check when the file is executed directly.
if __name__ == "__main__":
    test_template_reading_and_placeholders()