ai-business-write/process_templates_docx_only.py

590 lines
26 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
处理已转换的 .docx 模板文档,自动添加占位符
此脚本专门处理已经手动转换为 .docx 格式的文档,跳过 .doc 转换步骤
根据占位符与字段对照表,智能识别文档类型并添加相应的占位符
使用AI大模型智能分析文档内容,识别可替换位置
"""
import os
import re
from pathlib import Path
from typing import Dict, List, Optional
try:
from docx import Document
except ImportError:
print("错误: 请先安装 python-docx: pip install python-docx")
exit(1)
# 尝试导入AI辅助工具
try:
from template_ai_helper import TemplateAIHelper, get_available_fields_for_document
HAS_AI_HELPER = True
except ImportError:
HAS_AI_HELPER = False
print("警告: 无法导入AI辅助工具将使用基础模式不使用AI分析")
# Directory that contains this script; every other path is derived from it.
PROJECT_ROOT = Path(__file__).parent
# Processed templates are written under the "模板" directory ...
OUTPUT_TEMPLATES_DIR = PROJECT_ROOT.joinpath("模板")
# ... and the untouched source templates are read from its "原始模板" subdirectory.
ORIGINAL_TEMPLATES_DIR = OUTPUT_TEMPLATES_DIR.joinpath("原始模板")
# Markdown table that maps placeholders to field codes.
FIELD_MAPPING_FILE = PROJECT_ROOT.joinpath("占位符与字段对照表.md")
# Document-type table, keyed by the text that identifies a template in its
# file name. Each entry holds the template code, the field codes that may be
# turned into placeholders, and the input fields associated with the template.

# Personal-profile field set shared by the interview-related templates.
_PROFILE_FIELDS = (
    "target_name", "target_organization_and_position", "target_gender",
    "target_date_of_birth_full", "target_political_status", "target_address",
    "target_registered_address", "target_contact", "target_place_of_origin",
    "target_ethnicity", "target_id_number", "investigation_team_code",
)

# Field set shared by the two investigation-plan templates.
_PLAN_FIELDS = (
    "target_name", "target_organization_and_position", "target_work_basic_info",
    "target_issue_description", "investigation_unit_name", "investigation_team_leader_name",
    "investigation_team_member_names", "investigation_location",
)

# Input fields used by templates that take both clue inputs.
_CLUE_INPUTS = ("clue_info", "target_basic_info_clue")


def _template(template_code, fields, input_fields=()):
    """Build one mapping entry; lists are copied so entries never share state."""
    return {
        "template_code": template_code,
        "fields": list(fields),
        "input_fields": list(input_fields),
    }


DOCUMENT_TYPE_MAPPING = {
    "请示报告卡": _template(
        "REPORT_CARD",
        ("target_name", "target_organization_and_position", "report_card_request_time"),
        ("clue_info",),
    ),
    "初步核实审批表": _template(
        "PRELIMINARY_VERIFICATION_APPROVAL",
        (
            "target_name", "target_organization_and_position", "target_gender",
            "target_date_of_birth", "target_political_status", "target_professional_rank",
            "clue_source", "target_issue_description", "department_opinion", "filler_name",
        ),
        _CLUE_INPUTS,
    ),
    "初核方案": _template("INVESTIGATION_PLAN", _PLAN_FIELDS, _CLUE_INPUTS),
    "附件初核方案": _template("INVESTIGATION_PLAN", _PLAN_FIELDS, _CLUE_INPUTS),
    "谈话通知书": _template(
        "NOTIFICATION_LETTER",
        (
            "target_name", "target_organization_and_position", "target_id_number",
            "appointment_time", "appointment_location", "approval_time",
            "handling_department", "handler_name", "notification_time", "notification_location",
        ),
        ("target_basic_info_clue",),
    ),
    "谈话笔录": _template("INTERVIEW_RECORD", _PROFILE_FIELDS),
    "谈话询问对象情况摸底调查30问": _template("INVESTIGATION_30_QUESTIONS", _PROFILE_FIELDS),
    "被谈话人权利义务告知书": _template("RIGHTS_OBLIGATIONS_NOTICE", _PROFILE_FIELDS),
    "点对点交接单": _template("HANDOVER_FORM", _PROFILE_FIELDS),
    "陪送交接单": _template("ESCORT_HANDOVER_FORM", _PROFILE_FIELDS),
    "保密承诺书": _template("CONFIDENTIALITY_COMMITMENT", _PROFILE_FIELDS),
    "办案人员-办案安全保密承诺书": _template(
        "INVESTIGATOR_CONFIDENTIALITY_COMMITMENT", _PROFILE_FIELDS
    ),
    "请示报告卡(初核报告结论)": _template(
        "REPORT_CARD_CONCLUSION",
        ("investigation_team_code", "target_name", "target_problem_description", "target_attitude"),
    ),
    "初核情况报告": _template(
        "INVESTIGATION_REPORT",
        (
            "target_name", "commission_name", "target_work_basic_info",
            "target_issue_description", "target_problem_description",
            "target_organization_and_position",
        ),
        _CLUE_INPUTS,
    ),
    "谈话审批表": _template("INTERVIEW_APPROVAL_FORM", _PROFILE_FIELDS, _CLUE_INPUTS),
    "谈话前安全风险评估表": _template("PRE_INTERVIEW_RISK_ASSESSMENT", _PROFILE_FIELDS, _CLUE_INPUTS),
    "谈话方案": _template("INTERVIEW_PLAN", _PROFILE_FIELDS, _CLUE_INPUTS),
    "谈话后安全风险评估表": _template("POST_INTERVIEW_RISK_ASSESSMENT", _PROFILE_FIELDS, _CLUE_INPUTS),
}
# Label text as it appears in the documents -> field code used in placeholders.
# Several labels may share one code; the order of the pairs is the order in
# which labels are tried by the rule-based matcher.
_FIELD_LABEL_PAIRS = (
    # Person under verification
    ("被核查人姓名", "target_name"),
    ("被核查人员单位及职务", "target_organization_and_position"),
    ("被核查人员性别", "target_gender"),
    ("被核查人员出生年月", "target_date_of_birth"),
    ("被核查人员出生年月日", "target_date_of_birth_full"),
    ("被核查人员政治面貌", "target_political_status"),
    ("被核查人员职级", "target_professional_rank"),
    ("被核查人员身份证号", "target_id_number"),
    ("被核查人员身份证件及号码", "target_id_number"),
    ("被核查人员住址", "target_address"),
    ("被核查人员户籍住址", "target_registered_address"),
    ("被核查人员联系方式", "target_contact"),
    ("被核查人员籍贯", "target_place_of_origin"),
    ("被核查人员民族", "target_ethnicity"),
    # Clue and issue description
    ("线索来源", "clue_source"),
    ("主要问题线索", "target_issue_description"),
    ("被核查人问题描述", "target_problem_description"),
    ("被核查人员工作基本情况", "target_work_basic_info"),
    # Verification team
    ("核查单位名称", "investigation_unit_name"),
    ("核查组组长姓名", "investigation_team_leader_name"),
    ("核查组成员姓名", "investigation_team_member_names"),
    ("核查地点", "investigation_location"),
    ("核查组代号", "investigation_team_code"),
    # Notification letter
    ("应到时间", "appointment_time"),
    ("应到地点", "appointment_location"),
    ("批准时间", "approval_time"),
    ("承办部门", "handling_department"),
    ("承办人", "handler_name"),
    ("谈话通知时间", "notification_time"),
    ("谈话通知地点", "notification_location"),
    # Form-specific fields
    ("请示报告卡请示时间", "report_card_request_time"),
    ("初步核实审批表承办部门意见", "department_opinion"),
    ("初步核实审批表填表人", "filler_name"),
    ("被核查人员本人认识和态度", "target_attitude"),
    ("纪委名称", "commission_name"),
)

FIELD_NAME_TO_CODE = dict(_FIELD_LABEL_PAIRS)
def identify_document_type(
    file_name: str,
    type_mapping: Optional[Dict[str, Dict]] = None,
) -> Optional[Dict]:
    """
    Identify the document type of a template from its file name.

    The file stem is cleaned of conversion markers and "XXX" stubs, then
    compared against the keys of the type mapping. The longest key that occurs
    in the cleaned name wins, so a specific key such as
    "办案人员-办案安全保密承诺书" is not shadowed by a shorter key it contains
    ("保密承诺书"). If no key occurs as a whole, the parts of each key that are
    separated by parentheses or whitespace are tried as keywords.

    Args:
        file_name: File name (with or without extension).
        type_mapping: Mapping of document-type key -> configuration. Defaults
            to the module-level DOCUMENT_TYPE_MAPPING.

    Returns:
        The configuration of the matching document type, or None when the
        file name cannot be recognised.
    """
    if type_mapping is None:
        type_mapping = DOCUMENT_TYPE_MAPPING

    # Treat full-width and ASCII parentheses alike. Escapes are used on
    # purpose: the two forms are visually confusable in source code.
    paren_table = str.maketrans({"\uff08": "(", "\uff09": ")"})

    base_name = Path(file_name).stem.translate(paren_table)
    # "(XXX)" has to go before the bare "XXX", otherwise "()" is left behind.
    for marker in ("(XXX)", "XXX", "_转自DOC", "转自DOC", "模板"):
        base_name = base_name.replace(marker, "")
    base_name = base_name.strip()

    # Longest key first; sorted() is stable, so keys of equal length keep
    # their order in the mapping.
    candidates = sorted(
        ((doc_type.translate(paren_table), config) for doc_type, config in type_mapping.items()),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    # Pass 1: the whole key occurs in the cleaned file name.
    for doc_type, config in candidates:
        if doc_type and doc_type in base_name:
            return config

    # Pass 2: any multi-character part of the key occurs in the file name.
    for doc_type, config in candidates:
        keywords = re.split(r"[()\s]+", doc_type)
        if any(len(keyword) > 1 and keyword in base_name for keyword in keywords):
            return config

    return None
def apply_ai_replacements(text: str, ai_replacements: List[Dict]) -> str:
    """
    Apply the replacement suggestions produced by the AI analysis.

    Suggestions are applied in descending order of confidence. A suggestion is
    used only when its confidence is above 0.7 and both its original text and
    its replacement are non-empty; only the first occurrence of the original
    text is replaced.

    Args:
        text: Text to rewrite.
        ai_replacements: Suggestions, each a dict with the keys
            'original_text', 'replacement' and 'confidence'.

    Returns:
        The text with all accepted suggestions applied.
    """
    if not ai_replacements:
        return text

    result_text = text
    # "or 0" makes a missing or None confidence count as zero.
    ranked = sorted(
        ai_replacements,
        key=lambda item: item.get('confidence') or 0,
        reverse=True,
    )
    for suggestion in ranked:
        original = suggestion.get('original_text', '')
        replacement_text = suggestion.get('replacement', '')
        confidence = suggestion.get('confidence') or 0
        if confidence <= 0.7 or not original or not replacement_text:
            continue
        # Plain substring replacement: the original text must not be
        # regex-escaped, or any text containing spaces or punctuation would
        # never be found. str.replace leaves the text alone when the
        # substring is absent.
        result_text = result_text.replace(original, replacement_text, 1)
    return result_text
def _init_ai_helper(doc_config: Dict):
    """Create the AI helper and its field list; return (None, []) on any failure."""
    try:
        print(f" [初始化] 正在初始化AI助手...")
        helper = TemplateAIHelper()
        # Fall back to rule-only mode when the API cannot be reached.
        if not helper.test_api_connection():
            print(f" [初始化] ⚠ API连接测试失败将使用基础模式")
            return None, []
        fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE)
        print(f" [初始化] ✓ AI分析已启用可用字段: {len(fields)} 个)")
        return helper, fields
    except Exception as e:
        print(f" [初始化] ⚠ AI分析不可用: {e},将使用基础模式")
        import traceback
        traceback.print_exc()
        return None, []


def _build_field_rules(field_codes: List[str]) -> List[tuple]:
    """Compile, once per document, the label-based patterns for the given field codes."""
    rules = []
    for field_code in field_codes:
        # Every label that maps to this code is used, not only the first one.
        for field_name, code in FIELD_NAME_TO_CODE.items():
            if code != field_code:
                continue
            label = re.escape(field_name)
            # "<label>: <value>" where the value runs up to whitespace, end of
            # text or a sentence delimiter. Both the ASCII and the full-width
            # colon (U+FF1A) are accepted. The delimiter group must not
            # contain an empty alternative: with one, the lazy value group
            # stops after a single character.
            # NOTE(review): U+FF0C (full-width comma) replaces the empty
            # alternative of the previous pattern - confirm this is the
            # intended delimiter.
            value_pattern = re.compile(
                rf"({label}[:\uff1a]\s*)([^\n\r{{]+?)(\s|$|\uff0c|\u3002)"
            )
            # "<label>: XXX" style stubs; the longer alternative comes first
            # so that "待填写" is not cut short by "待填".
            stub_pattern = re.compile(rf"({label}[:\uff1a]\s*)(XXX|xxx|待填写|待填)")
            rules.append((field_code, value_pattern, stub_pattern))
    return rules


def _apply_field_rules(text: str, rules: List[tuple]) -> str:
    """Replace the value that follows each known label with its {{field_code}} placeholder."""
    for field_code, value_pattern, stub_pattern in rules:
        placeholder = f"{{{{{field_code}}}}}"

        def fill_value(match, placeholder=placeholder):
            value = match.group(2).strip()
            # Leave existing placeholders, blanks and dash fillers untouched.
            if value and not value.startswith("{{") and value not in ("——", "-"):
                return f"{match.group(1)}{placeholder}{match.group(3)}"
            return match.group(0)

        text = value_pattern.sub(fill_value, text)
        text = stub_pattern.sub(
            lambda match, placeholder=placeholder: f"{match.group(1)}{placeholder}", text
        )
    return text


def _rewrite_paragraph(paragraph, rules: List[tuple], analyze, label: str):
    """
    Run the AI suggestions (if any) and then the rules over one paragraph.

    `analyze` is either None or a callable taking the paragraph text and
    returning the AI suggestions; `label` names the paragraph in warnings.
    Returns (changed, ai_count): whether the paragraph was rewritten and how
    many AI suggestions were returned for a paragraph the AI step changed.
    """
    original_text = paragraph.text
    if not original_text:
        return False, 0

    text = original_text
    ai_count = 0
    if analyze is not None:
        try:
            suggestions = analyze(text)
            if suggestions:
                text = apply_ai_replacements(text, suggestions)
                if text != original_text:
                    ai_count = len(suggestions)
        except Exception as e:
            # Best effort: report the failure and continue with the rules.
            print(f" [AI] ⚠ {label} AI分析失败: {e}")

    # Rule matching runs after the AI step, as a supplement.
    text = _apply_field_rules(text, rules)
    if text == original_text:
        return False, ai_count

    # The whole paragraph is rewritten as one run.
    # NOTE(review): this presumably drops per-run formatting - confirm
    # against the python-docx documentation if that matters.
    paragraph.clear()
    paragraph.add_run(text)
    return True, ai_count


def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool:
    """
    Process one document and add placeholders to it.

    Every non-empty paragraph of the body and of every table cell is first
    passed to the AI helper (when enabled and available) and then to the
    label-based rules built from doc_config['fields']. The result is saved to
    output_path; parent directories are created as needed. The document is
    saved even when nothing was replaced.

    Args:
        input_path: Input file path (must have a .docx suffix).
        output_path: Output file path.
        doc_config: Document configuration; 'template_code' and 'fields' are read.
        use_ai: Whether to use the AI analysis (default True).

    Returns:
        True when the document was processed and saved; False when the input
        is not a .docx file, does not exist, or any step raised an exception.
    """
    try:
        # Only .docx files are handled.
        if input_path.suffix.lower() != '.docx':
            print(f" ⚠ 跳过: 不是 .docx 文件 ({input_path.suffix})")
            return False
        if not input_path.exists():
            print(f" ✗ 错误: 文件不存在: {input_path}")
            return False
        print(f" 处理: {input_path.name}")

        # Set up the AI helper when requested and importable.
        ai_helper, available_fields = None, []
        if use_ai and HAS_AI_HELPER:
            ai_helper, available_fields = _init_ai_helper(doc_config)
        ai_enabled = bool(ai_helper and available_fields)
        doc_type = doc_config.get('template_code', '未知')
        rules = _build_field_rules(doc_config.get('fields', []))

        print(f" [读取] 正在打开文档...")
        doc = Document(str(input_path))
        paragraphs = doc.paragraphs
        tables = doc.tables

        # Document statistics.
        total_paragraphs = sum(1 for p in paragraphs if p.text.strip())
        total_tables = len(tables)
        total_cells = sum(len(table.rows) * len(table.rows[0].cells) if table.rows else 0 for table in tables)
        print(f" [统计] 文档包含: {total_paragraphs} 个段落, {total_tables} 个表格, 约 {total_cells} 个单元格")

        replacement_count = 0
        ai_replacement_count = 0

        # Body paragraphs.
        print(f" [处理] 开始处理段落...")
        for para_idx, paragraph in enumerate(paragraphs):
            if not paragraph.text:
                continue
            analyze = None
            if ai_enabled:
                if para_idx % 10 == 0:
                    # The denominator counts all paragraphs, like para_idx does.
                    print(f" [进度] 处理段落 {para_idx+1}/{len(paragraphs)}...")

                def analyze(text):
                    return ai_helper.analyze_paragraph(text, available_fields, doc_type)

            changed, ai_count = _rewrite_paragraph(paragraph, rules, analyze, f"段落 {para_idx+1}")
            if ai_count:
                ai_replacement_count += ai_count
                print(f" [AI] 段落 {para_idx+1} 应用了 {ai_count} 个替换")
            if changed:
                replacement_count += 1

        # Table cells.
        print(f" [处理] 开始处理表格...")
        for table_idx, table in enumerate(tables):
            if table_idx % 5 == 0:
                print(f" [进度] 处理表格 {table_idx+1}/{total_tables}...")
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    analyze = None
                    if ai_enabled:

                        def analyze(text, row_idx=row_idx, col_idx=col_idx):
                            return ai_helper.analyze_table_cell(
                                text, available_fields, doc_type, row_idx, col_idx
                            )

                    label = f"表格 {table_idx+1} 单元格 ({row_idx+1},{col_idx+1})"
                    for paragraph in cell.paragraphs:
                        changed, ai_count = _rewrite_paragraph(paragraph, rules, analyze, label)
                        ai_replacement_count += ai_count
                        if changed:
                            replacement_count += 1

        # Make sure the output directory exists, then save.
        print(f" [保存] 正在保存文档...")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))
        print(f" [保存] ✓ 文档已保存到: {output_path}")

        if replacement_count > 0 or ai_replacement_count > 0:
            msg = f" ✓ 处理成功"
            if ai_replacement_count > 0:
                msg += f"AI识别 {ai_replacement_count}"
            if replacement_count > 0:
                msg += f",规则匹配 {replacement_count}"
            print(msg)
        else:
            print(f" ⚠ 处理完成,但未找到需要替换的内容(可能已包含占位符)")
        return True
    except Exception as e:
        print(f" ✗ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
def process_all_templates():
    """
    Process every converted .docx template under ORIGINAL_TEMPLATES_DIR.

    Each file is matched to a document type by name, processed with
    process_document (AI analysis enabled) and written below
    OUTPUT_TEMPLATES_DIR, keeping the relative directory structure. Files
    whose type cannot be identified are skipped. A summary is printed at the
    end. Returns None.
    """
    print("="*80)
    print("处理已转换的 .docx 模板文档(跳过 .doc 转换)")
    print("="*80)
    print()
    if not ORIGINAL_TEMPLATES_DIR.exists():
        print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}")
        return

    # Collect the files once, in a stable order. The suffix test ignores case
    # (as process_document does) and Word's "~$" lock files are left out.
    docx_files = sorted(
        Path(root) / name
        for root, _dirs, names in os.walk(ORIGINAL_TEMPLATES_DIR)
        for name in names
        if name.lower().endswith('.docx') and not name.startswith('~$')
    )
    total_files = len(docx_files)
    print(f"找到 {total_files} 个 .docx 文件需要处理\n")

    processed_count = 0
    skipped_count = 0
    failed_count = 0

    for file_index, input_path in enumerate(docx_files, start=1):
        file_name = input_path.name

        # Identify the document type from the file name.
        doc_config = identify_document_type(file_name)
        if not doc_config:
            print(f"\n⚠ 无法识别文档类型: {file_name}")
            print(f" 路径: {input_path}")
            skipped_count += 1
            continue

        # Output path keeps the directory structure relative to the source root.
        relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR)

        # Clean the file name: drop conversion markers and "XXX" stubs. The
        # parenthesised form (ASCII or full-width) is removed before the bare
        # "XXX" so that no empty "()" is left in the output name.
        clean_name = input_path.stem
        clean_name = clean_name.replace("_转自DOC", "").replace("转自DOC", "")
        clean_name = re.sub(r"[(\uff08]XXX[)\uff09]", "", clean_name)
        clean_name = clean_name.replace("XXX", "").strip()
        output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{clean_name}.docx"

        print(f"\n{'='*80}")
        print(f"[{file_index}/{total_files}] 处理: {file_name}")
        print(f"{'='*80}")
        print(f" 类型: {doc_config.get('template_code', 'UNKNOWN')}")
        print(f" 输入: {input_path}")
        print(f" 输出: {output_path}")

        if process_document(input_path, output_path, doc_config, use_ai=True):
            processed_count += 1
        else:
            failed_count += 1

    # Summary.
    print("\n" + "="*80)
    print("处理完成")
    print("="*80)
    print(f"成功处理: {processed_count} 个文件")
    print(f"跳过: {skipped_count} 个文件(无法识别类型)")
    print(f"失败: {failed_count} 个文件")
    print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}")
    print("\n请检查生成的模板文件,确认占位符是否正确添加。")
    print("如有需要,请手动调整占位符位置。")
# Run the batch processing only when this file is executed as a script.
if __name__ == "__main__":
    process_all_templates()