# ai-business-write/check_template_placeholders.py

# 156 lines
# 4.6 KiB
# Python

"""
Check whether the placeholders in the template files under the
template_finish folder can be recognized correctly.
"""
import os
import re
from pathlib import Path
from docx import Document
from collections import defaultdict
def _iter_table_paragraphs(table):
    """Yield every paragraph inside *table*, descending into nested tables."""
    for row in table.rows:
        for cell in row.cells:
            yield from cell.paragraphs
            # A cell can itself hold tables; a flat walk over doc.tables
            # never reaches them, so recurse explicitly.
            for nested_table in cell.tables:
                yield from _iter_table_paragraphs(nested_table)


def extract_placeholders_from_docx(file_path):
    """Extract all ``{{field_code}}`` placeholders from a docx file.

    The text of every paragraph exposed by ``doc.paragraphs`` and of every
    paragraph inside ``doc.tables`` (including tables nested inside table
    cells) is searched. Whitespace just inside the braces is stripped, so
    ``{{ name }}`` and ``{{name}}`` are reported as the same placeholder.

    Args:
        file_path: Path of the docx file to read.

    Returns:
        Sorted list of unique placeholder names, e.g.
        ``['field_code1', 'field_code2']``. An empty list is returned both
        when the file contains no placeholder and when reading it fails
        (in the latter case the error is printed, not raised).
    """
    # Compiled once instead of being re-resolved for every paragraph.
    pattern = re.compile(r'\{\{([^}]+)\}\}')  # matches {{field_code}}
    placeholders = set()

    def collect(paragraphs):
        """Add every placeholder found in *paragraphs* to the result set."""
        for paragraph in paragraphs:
            for match in pattern.findall(paragraph.text):
                placeholders.add(match.strip())

    try:
        doc = Document(file_path)
        # Body paragraphs first, then everything reachable through tables.
        collect(doc.paragraphs)
        for table in doc.tables:
            collect(_iter_table_paragraphs(table))
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable or corrupt
        # template must not abort the scan of the remaining files.
        print(f" 错误: 读取文件失败 - {str(e)}")
        return []
    return sorted(placeholders)
def _print_banner(title):
    """Print *title* framed by two 80-character ``=`` rules."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def check_templates_in_directory(base_dir):
    """Scan a directory tree for docx templates and report their placeholders.

    Every ``*.docx`` file found recursively below ``base_dir`` is passed to
    ``extract_placeholders_from_docx``; temporary files whose name starts
    with ``~$`` are skipped. A per-file listing, a summary and a per-file
    detail section are printed to stdout. Files are visited in sorted path
    order so the numbering in the report is the same on every run.

    Args:
        base_dir: Root directory containing the template files.

    Returns:
        ``None`` if ``base_dir`` is not an existing directory. Otherwise a
        dict with the keys:

        * ``total_files``: number of docx files inspected.
        * ``valid_files``: files in which at least one placeholder was found.
        * ``invalid_files``: files for which the extractor returned nothing
          (no placeholder found, or the file could not be read).
        * ``all_placeholders``: relative file path (str) -> list of
          placeholders, only for files that have at least one.
        * ``unique_placeholders``: sorted list of all distinct placeholders.
    """
    base_path = Path(base_dir)
    # is_dir() rather than exists(): a regular file is not a template root.
    if not base_path.is_dir():
        print(f"错误: 目录不存在 - {base_dir}")
        return None

    # Running statistics.
    total_files = 0
    valid_files = 0
    invalid_files = 0
    all_placeholders = {}  # relative file path -> list of placeholders
    all_unique_placeholders = set()

    _print_banner("模板文件占位符检查报告")
    print()

    # rglob() yields entries in filesystem order; sort for a stable report.
    for docx_file in sorted(base_path.rglob("*.docx")):
        # Skip temporary files (names starting with "~$").
        if docx_file.name.startswith("~$"):
            continue

        total_files += 1
        relative_path = docx_file.relative_to(base_path)
        print(f"[{total_files}] 检查文件: {relative_path}")

        placeholders = extract_placeholders_from_docx(str(docx_file))
        if placeholders:
            valid_files += 1
            all_placeholders[str(relative_path)] = placeholders
            all_unique_placeholders.update(placeholders)
            print(f" ✓ 找到 {len(placeholders)} 个占位符:")
            for i, placeholder in enumerate(placeholders, 1):
                print(f" {i}. {{{{ {placeholder} }}}}")
        else:
            invalid_files += 1
            print(" ⚠ 未找到占位符")
        print()

    unique_sorted = sorted(all_unique_placeholders)

    # Summary section.
    _print_banner("检查汇总")
    print(f"总文件数: {total_files}")
    print(f"包含占位符的文件: {valid_files}")
    print(f"未找到占位符的文件: {invalid_files}")
    print(f"唯一占位符总数: {len(all_unique_placeholders)}")
    print()

    # Global list of distinct placeholders, followed by per-file details.
    if unique_sorted:
        print("所有唯一占位符列表:")
        for i, placeholder in enumerate(unique_sorted, 1):
            print(f" {i}. {{{{ {placeholder} }}}}")
        print()

        _print_banner("各文件占位符详情")
        for file_path, placeholders in sorted(all_placeholders.items()):
            print(f"\n文件: {file_path}")
            print(f"占位符数量: {len(placeholders)}")
            for placeholder in placeholders:
                print(f" - {{{{ {placeholder} }}}}")

    # Returned so other scripts can consume the result.
    return {
        'total_files': total_files,
        'valid_files': valid_files,
        'invalid_files': invalid_files,
        'all_placeholders': all_placeholders,
        'unique_placeholders': unique_sorted,
    }
def main():
    """Check the ``template_finish`` folder that sits next to this script.

    Prints the directory being checked, runs
    ``check_templates_in_directory`` on it and, when that call returns a
    truthy result, prints a closing "done" banner.
    """
    script_dir = os.path.dirname(__file__)
    template_dir = os.path.join(script_dir, 'template_finish')
    print(f"检查目录: {template_dir}")
    print()

    # Nothing more to report when the check produced no result.
    if not check_templates_in_directory(template_dir):
        return

    rule = "=" * 80
    print("\n" + rule)
    print("检查完成!")
    print(rule)


# Run the check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()