ai-business-write/process_templates.py

739 lines
29 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
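Process the raw template documents and insert placeholders automatically.

The document type is identified with the help of the placeholder/field
reference table, and the matching placeholders are added to the document.
An AI model is used to analyse the document content and locate the positions
that can be replaced.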
"""
import os
import re
from pathlib import Path
from typing import Dict, List, Optional
import json
try:
from docx import Document
from docx.shared import Pt
except ImportError:
print("错误: 请先安装 python-docx: pip install python-docx")
exit(1)
# 尝试导入AI辅助工具
try:
from template_ai_helper import TemplateAIHelper, get_available_fields_for_document
HAS_AI_HELPER = True
except ImportError:
HAS_AI_HELPER = False
print("警告: 无法导入AI辅助工具将使用基础模式不使用AI分析")
# 尝试导入win32com用于.doc文件转换Windows系统
HAS_WIN32COM = False
HAS_PYTHONCOM = False
try:
import win32com.client
HAS_WIN32COM = True
try:
import pythoncom
HAS_PYTHONCOM = True
except ImportError:
pass
except ImportError:
pass
if not HAS_WIN32COM:
print("="*60)
print("警告: 未安装 pywin32无法自动转换 .doc 文件")
print("="*60)
print("解决方案:")
print(" 1. 安装 pywin32: pip install pywin32")
print(" 2. 或者手动将所有 .doc 文件转换为 .docx 格式")
print(" 3. 转换后重新运行此脚本")
print("="*60)
# Project root: the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent
# Raw templates are read from "<root>/模板/原始模板".
ORIGINAL_TEMPLATES_DIR = PROJECT_ROOT.joinpath("模板", "原始模板")
# Processed templates are written below "<root>/模板".
OUTPUT_TEMPLATES_DIR = PROJECT_ROOT.joinpath("模板")
# Placeholder/field reference table kept next to this script.
FIELD_MAPPING_FILE = PROJECT_ROOT.joinpath("占位符与字段对照表.md")
# Document-type registry.  Each key is a name fragment that is searched for in
# a template's file name; each value carries the template code, the field
# codes used for rule-based placeholder insertion ("fields") and the
# "input_fields" list (not read anywhere in this file's visible code).

# Personal-profile field set shared by most interview-related templates.
_PROFILE_FIELDS = (
    "target_name", "target_organization_and_position", "target_gender",
    "target_date_of_birth_full", "target_political_status", "target_address",
    "target_registered_address", "target_contact", "target_place_of_origin",
    "target_ethnicity", "target_id_number", "investigation_team_code",
)
# Input-field set shared by the clue-driven templates.
_CLUE_AND_BASIC_INFO = ("clue_info", "target_basic_info_clue")


def _doc_type(template_code, fields, input_fields=()):
    """Build one registry entry; every entry gets its own list objects."""
    return {
        "template_code": template_code,
        "fields": list(fields),
        "input_fields": list(input_fields),
    }


DOCUMENT_TYPE_MAPPING = {
    "请示报告卡": _doc_type(
        "REPORT_CARD",
        ("target_name", "target_organization_and_position", "report_card_request_time"),
        ("clue_info",),
    ),
    "初步核实审批表": _doc_type(
        "PRELIMINARY_VERIFICATION_APPROVAL",
        (
            "target_name", "target_organization_and_position", "target_gender",
            "target_date_of_birth", "target_political_status", "target_professional_rank",
            "clue_source", "target_issue_description", "department_opinion", "filler_name",
        ),
        _CLUE_AND_BASIC_INFO,
    ),
    "初核方案": _doc_type(
        "INVESTIGATION_PLAN",
        (
            "target_name", "target_organization_and_position", "target_work_basic_info",
            "target_issue_description", "investigation_unit_name", "investigation_team_leader_name",
            "investigation_team_member_names", "investigation_location",
        ),
        _CLUE_AND_BASIC_INFO,
    ),
    "谈话通知书": _doc_type(
        "NOTIFICATION_LETTER",
        (
            "target_name", "target_organization_and_position", "target_id_number",
            "appointment_time", "appointment_location", "approval_time",
            "handling_department", "handler_name", "notification_time", "notification_location",
        ),
        ("target_basic_info_clue",),
    ),
    "谈话笔录": _doc_type("INTERVIEW_RECORD", _PROFILE_FIELDS),
    "谈话询问对象情况摸底调查30问": _doc_type("INVESTIGATION_30_QUESTIONS", _PROFILE_FIELDS),
    "被谈话人权利义务告知书": _doc_type("RIGHTS_OBLIGATIONS_NOTICE", _PROFILE_FIELDS),
    "点对点交接单": _doc_type("HANDOVER_FORM", _PROFILE_FIELDS),
    "陪送交接单": _doc_type("ESCORT_HANDOVER_FORM", _PROFILE_FIELDS),
    "保密承诺书": _doc_type("CONFIDENTIALITY_COMMITMENT", _PROFILE_FIELDS),
    "办案人员-办案安全保密承诺书": _doc_type(
        "INVESTIGATOR_CONFIDENTIALITY_COMMITMENT", _PROFILE_FIELDS
    ),
    "请示报告卡(初核报告结论)": _doc_type(
        "REPORT_CARD_CONCLUSION",
        ("investigation_team_code", "target_name", "target_problem_description", "target_attitude"),
    ),
    "初核情况报告": _doc_type(
        "INVESTIGATION_REPORT",
        (
            "target_name", "commission_name", "target_work_basic_info",
            "target_issue_description", "target_problem_description",
            "target_organization_and_position",
        ),
        _CLUE_AND_BASIC_INFO,
    ),
    "谈话审批表": _doc_type("INTERVIEW_APPROVAL_FORM", _PROFILE_FIELDS, _CLUE_AND_BASIC_INFO),
    "谈话前安全风险评估表": _doc_type(
        "PRE_INTERVIEW_RISK_ASSESSMENT", _PROFILE_FIELDS, _CLUE_AND_BASIC_INFO
    ),
    "谈话方案": _doc_type("INTERVIEW_PLAN", _PROFILE_FIELDS, _CLUE_AND_BASIC_INFO),
    "谈话后安全风险评估表": _doc_type(
        "POST_INTERVIEW_RISK_ASSESSMENT", _PROFILE_FIELDS, _CLUE_AND_BASIC_INFO
    ),
}
# Human-readable field label -> field code, used by the rule-based matcher to
# recognise labels inside documents.  Several labels may share one code.
_FIELD_LABEL_PAIRS = (
    # Person under verification: identity and profile
    ("被核查人姓名", "target_name"),
    ("被核查人员单位及职务", "target_organization_and_position"),
    ("被核查人员性别", "target_gender"),
    ("被核查人员出生年月", "target_date_of_birth"),
    ("被核查人员出生年月日", "target_date_of_birth_full"),
    ("被核查人员政治面貌", "target_political_status"),
    ("被核查人员职级", "target_professional_rank"),
    ("被核查人员身份证号", "target_id_number"),
    ("被核查人员身份证件及号码", "target_id_number"),
    ("被核查人员住址", "target_address"),
    ("被核查人员户籍住址", "target_registered_address"),
    ("被核查人员联系方式", "target_contact"),
    ("被核查人员籍贯", "target_place_of_origin"),
    ("被核查人员民族", "target_ethnicity"),
    # Clue and issue descriptions
    ("线索来源", "clue_source"),
    ("主要问题线索", "target_issue_description"),
    ("被核查人问题描述", "target_problem_description"),
    ("被核查人员工作基本情况", "target_work_basic_info"),
    # Verification unit / team
    ("核查单位名称", "investigation_unit_name"),
    ("核查组组长姓名", "investigation_team_leader_name"),
    ("核查组成员姓名", "investigation_team_member_names"),
    ("核查地点", "investigation_location"),
    ("核查组代号", "investigation_team_code"),
    # Appointment / notification details
    ("应到时间", "appointment_time"),
    ("应到地点", "appointment_location"),
    ("批准时间", "approval_time"),
    ("承办部门", "handling_department"),
    ("承办人", "handler_name"),
    ("谈话通知时间", "notification_time"),
    ("谈话通知地点", "notification_location"),
    # Form-specific fields
    ("请示报告卡请示时间", "report_card_request_time"),
    ("初步核实审批表承办部门意见", "department_opinion"),
    ("初步核实审批表填表人", "filler_name"),
    ("被核查人员本人认识和态度", "target_attitude"),
    ("纪委名称", "commission_name"),
)
FIELD_NAME_TO_CODE = dict(_FIELD_LABEL_PAIRS)
def convert_doc_to_docx(doc_path: Path) -> Optional[Path]:
    """
    Convert a legacy ``.doc`` file to ``.docx`` by automating Word via win32com.

    The converted file is written next to the source with the suffix replaced
    by ``.docx``.

    Args:
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        Path of the converted ``.docx`` file, or ``None`` when pywin32 is not
        available, the source file is missing, or the conversion fails.
    """
    if not HAS_WIN32COM:
        print(f" 警告: 未安装 pywin32无法转换 {doc_path.name}")
        print(" 解决方案: pip install pywin32")
        print(f" 或者: 请手动将 {doc_path.name} 转换为 .docx 格式")
        return None

    # Validate the input before paying the cost of launching Word.
    if not doc_path.exists():
        print(f" ✗ 错误: 源文件不存在: {doc_path}")
        return None

    docx_path = doc_path.with_suffix('.docx')
    # Absolute paths are handed to Word for both the source and the target.
    abs_doc_path = str(doc_path.absolute())
    abs_docx_path = str(docx_path.absolute())

    word = None
    doc = None
    com_initialized = False
    try:
        # Initialise COM for this thread when pythoncom is available.
        if HAS_PYTHONCOM:
            pythoncom.CoInitialize()
            com_initialized = True

        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        word.DisplayAlerts = 0  # suppress Word alert dialogs

        print(" 正在转换...")
        print(f" 源: {doc_path.name}")
        print(f" 目标: {docx_path.name}")

        doc = word.Documents.Open(
            abs_doc_path,
            ReadOnly=True,
            ConfirmConversions=False,
            AddToRecentFiles=False
        )
        # 16 = wdFormatXMLDocument (.docx)
        doc.SaveAs2(
            abs_docx_path,
            FileFormat=16
        )
        doc.Close(False)  # False: do not save changes to the source
        doc = None
        word.Quit()
        word = None

        # Only report success when a non-empty target file really exists.
        if docx_path.exists() and docx_path.stat().st_size > 0:
            file_size = docx_path.stat().st_size
            print(f" ✓ 转换成功 ({file_size} 字节)")
            return docx_path
        print(" ✗ 转换失败: 目标文件不存在或为空")
        return None
    except Exception as e:
        error_msg = str(e)
        error_type = type(e).__name__
        print(f" ✗ 转换失败: {error_type}: {error_msg}")
        # Heuristic diagnostics based on the error text.
        print(" 诊断信息:")
        if "Word.Application" in error_msg or "COM" in error_msg or "CreateObject" in error_msg:
            print(" - 可能原因: Microsoft Word 未安装或无法访问")
            print(" - 解决方案:")
            print(" 1. 确保已安装 Microsoft Word不是 WPS")
            print(" 2. 手动将 .doc 文件转换为 .docx 格式")
            print(" 3. 使用 Word 打开文件,另存为 .docx 格式")
        elif "pywin32" in error_msg.lower() or "win32com" in error_msg.lower():
            print(" - 解决方案: pip install pywin32")
        elif "权限" in error_msg or "Permission" in error_msg:
            print(" - 可能原因: 文件被其他程序占用或权限不足")
            print(" - 解决方案: 关闭文件,检查文件权限")
        else:
            print(" - 请检查错误信息并手动转换文件")
        return None
    finally:
        # Best-effort cleanup: `doc`/`word` are still set only when the happy
        # path did not get to release them.  Failures here must not mask the
        # original outcome, but KeyboardInterrupt/SystemExit still propagate.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
        # Balance CoInitialize only if it actually succeeded.
        if com_initialized:
            try:
                pythoncom.CoUninitialize()
            except Exception:
                pass
def identify_document_type(file_name: str, mapping: Optional[Dict[str, Dict]] = None) -> Optional[Dict]:
    """
    Identify a document's type from its file name.

    The file stem is cleaned of "XXX" filler text and then compared against
    the keys of the type registry.  When several keys occur in the name, the
    longest one wins, so a specific key such as "请示报告卡(初核报告结论)" is
    not shadowed by the shorter "请示报告卡" that it contains.

    Args:
        file_name: File name (with or without extension / directories).
        mapping: Registry of ``name fragment -> config``.  Defaults to the
            module-level ``DOCUMENT_TYPE_MAPPING``.

    Returns:
        The matching configuration dict, or ``None`` if nothing matches.
    """
    if mapping is None:
        mapping = DOCUMENT_TYPE_MAPPING

    # Treat full-width and ASCII parentheses as equivalent on both sides.
    paren_fold = str.maketrans({"\uff08": "(", "\uff09": ")"})

    base_name = Path(file_name).stem.translate(paren_fold)
    # Remove the parenthesised filler first; stripping the bare "XXX" first
    # would leave an empty "()" behind.
    for filler in ("(XXX)", "XXX"):
        base_name = base_name.replace(filler, "")
    base_name = base_name.strip()

    # Longest key contained in the name wins; ties keep registry order.
    best_key = None
    best_len = 0
    for doc_type in mapping:
        folded = doc_type.translate(paren_fold)
        if folded and folded in base_name and len(folded) > best_len:
            best_key, best_len = doc_type, len(folded)
    if best_key is not None:
        return mapping[best_key]

    # Fallback: for keys made of whitespace-separated words, any single word
    # occurring in the name is accepted.
    for doc_type, config in mapping.items():
        keywords = doc_type.translate(paren_fold).split()
        if any(keyword in base_name for keyword in keywords):
            return config
    return None
def find_placeholder_positions(text: str, field_name: str, field_code: str) -> List[tuple]:
    """
    Locate values following a field label that should become placeholders.

    A candidate is the text after ``<field_name>`` plus a colon (ASCII ``:``
    or full-width ``:``) up to the end of the line.  Values that are empty,
    already a ``{{...}}`` placeholder, or a known filler marker are skipped.

    Args:
        text: Text to scan.
        field_name: Human-readable field label to look for.
        field_code: Field code used to build the ``{{field_code}}`` placeholder.

    Returns:
        List of ``(start, end, replacement_text)`` tuples, where
        ``text[start:end]`` is the value (without trailing whitespace) and
        ``replacement_text`` is the placeholder.
    """
    # Filler markers that are not real values.
    skip_values = {"XXX", "xxx", "-", "——", "待填", "待填写"}
    placeholder = f"{{{{{field_code}}}}}"
    # \uff1a is the full-width colon; written as an escape so that the pattern
    # survives tools that mangle look-alike Unicode characters.
    pattern = re.compile(rf"{re.escape(field_name)}[:\uff1a]\s*([^\n\r]+)")

    positions = []
    for match in pattern.finditer(text):
        raw_value = match.group(1)
        value = raw_value.strip()
        if not value or value.startswith("{{") or value in skip_values:
            continue
        start = match.start(1)
        # Exclude trailing whitespace so replacing the span does not eat it.
        end = start + len(raw_value.rstrip())
        positions.append((start, end, placeholder))
    return positions
def replace_text_in_runs(runs, old_text: str, new_text: str) -> bool:
    """
    Replace ``old_text`` with ``new_text`` inside a sequence of text runs.

    The first occurrence of ``old_text`` in the concatenated run text is
    located.  If it lies inside a single run, that run's text is updated with
    ``str.replace`` (which also replaces further occurrences inside that same
    run).  If it spans several runs, the replacement is written into the
    first affected run and the covered part is removed from the following
    runs, so the concatenated text ends up correct.

    Args:
        runs: Iterable of objects exposing a writable ``text`` attribute.
        old_text: Text to replace; an empty string is never replaced.
        new_text: Replacement text.

    Returns:
        ``True`` if a replacement was made, otherwise ``False``.
    """
    # str.replace('', x) would inject x between every character.
    if not old_text:
        return False

    runs = list(runs)  # allow one-shot iterables; we need two passes
    full_text = ''.join(run.text for run in runs)
    match_start = full_text.find(old_text)
    if match_start < 0:
        return False
    match_end = match_start + len(old_text)

    offset = 0
    replaced = False
    for run in runs:
        run_text = run.text
        run_start = offset
        run_end = offset + len(run_text)
        offset = run_end

        if not run_text or run_end <= match_start:
            continue  # run lies entirely before the match
        if run_start >= match_end:
            break  # run lies entirely after the match

        if not replaced:
            if match_end <= run_end:
                # Whole match sits in this run.
                run.text = run_text.replace(old_text, new_text)
                return True
            # Match starts here and continues: keep the prefix, then insert
            # the replacement so it inherits this run's formatting.
            run.text = run_text[:match_start - run_start] + new_text
            replaced = True
        else:
            # Continuation run: drop the part that belonged to the match.
            run.text = run_text[match_end - run_start:]
    return replaced
def apply_ai_replacements(text: str, ai_replacements: List[Dict], min_confidence: float = 0.7) -> str:
    """
    Apply AI-suggested replacements to a piece of text.

    Suggestions are processed from highest to lowest confidence.  A suggestion
    is applied only if its confidence is strictly greater than
    ``min_confidence`` and both its ``original_text`` and ``replacement`` are
    non-empty.  Only the first occurrence of ``original_text`` is replaced.

    Args:
        text: Original text.
        ai_replacements: Suggestion dicts with the keys ``original_text``,
            ``replacement`` and ``confidence``.
        min_confidence: Exclusive lower bound for applying a suggestion.

    Returns:
        The text with all accepted suggestions applied.
    """
    if not ai_replacements:
        return text

    def confidence_of(suggestion):
        # A missing or null confidence counts as 0 and is never applied.
        return suggestion.get('confidence') or 0

    result_text = text
    for suggestion in sorted(ai_replacements, key=confidence_of, reverse=True):
        original = suggestion.get('original_text', '')
        replacement_text = suggestion.get('replacement', '')
        if not (confidence_of(suggestion) > min_confidence and original and replacement_text):
            continue
        # Plain substring replacement: the text must NOT be regex-escaped,
        # otherwise originals containing spaces, dots, parentheses etc. would
        # never be found.
        result_text = result_text.replace(original, replacement_text, 1)
    return result_text
# Values that mark an intentionally blank field and must stay untouched.
_BLANK_VALUE_MARKERS = frozenset({"——", "-"})


def _build_field_rules(field_codes) -> List[tuple]:
    """Compile, once per document, the label patterns for the given field codes.

    Returns a list of ``(placeholder, value_pattern, filler_pattern)`` tuples,
    one per label in ``FIELD_NAME_TO_CODE`` that maps to a requested code
    (every alias of a code gets its own rule).
    """
    rules = []
    for field_code in field_codes:
        placeholder = f"{{{{{field_code}}}}}"
        for field_name, code in FIELD_NAME_TO_CODE.items():
            if code != field_code:
                continue
            # Label followed by an ASCII or full-width (\uff1a) colon.
            label = rf"({re.escape(field_name)}[:\uff1a]\s*)"
            # Rule 1: "label: value" - the value runs (lazily) up to
            # whitespace, end of text, a comma (ASCII / \uff0c) or a
            # full stop (\u3002).  The terminator must not contain an empty
            # alternative, or the lazy group would stop after one character.
            value_pattern = re.compile(label + r"([^\n\r{]+?)(\s|$|[,\uff0c\u3002])")
            # Rule 2: "label: <filler>" - longer alternatives first so that
            # "待填写" is not matched as "待填" + stray "写".
            filler_pattern = re.compile(label + r"(待填写|待填|XXX|xxx)")
            rules.append((placeholder, value_pattern, filler_pattern))
    return rules


def _apply_field_rules(text: str, rules: List[tuple]) -> str:
    """Apply the compiled label rules to ``text`` and return the result."""
    for placeholder, value_pattern, filler_pattern in rules:

        def fill_value(match, placeholder=placeholder):
            value = match.group(2).strip()
            # Leave empty values, existing placeholders and blank markers alone.
            if value and not value.startswith("{{") and value not in _BLANK_VALUE_MARKERS:
                return f"{match.group(1)}{placeholder}{match.group(3)}"
            return match.group(0)

        text = value_pattern.sub(fill_value, text)
        text = filler_pattern.sub(
            lambda match, placeholder=placeholder: match.group(1) + placeholder, text
        )
    return text


def _process_paragraph(paragraph, rules: List[tuple], analyze, failure_label: str) -> tuple:
    """Rewrite one paragraph; return ``(changed, ai_suggestion_count)``.

    ``analyze`` is either ``None`` or a callable ``text -> suggestions``.  An
    AI failure is reported and the rule-based pass still runs.
    """
    original_text = paragraph.text
    if not original_text:
        return False, 0

    text = original_text
    ai_count = 0
    if analyze is not None:
        try:
            ai_replacements = analyze(text)
            if ai_replacements:
                text = apply_ai_replacements(text, ai_replacements)
                if text != original_text:
                    ai_count = len(ai_replacements)
        except Exception as e:
            print(f" ⚠ {failure_label} AI分析失败: {e}")

    # Rule-based matching runs after the AI pass as a supplement.
    text = _apply_field_rules(text, rules)
    if text == original_text:
        return False, ai_count

    # NOTE: the paragraph is rebuilt as a single run, so formatting that
    # differed between the original runs is not carried over.
    paragraph.clear()
    paragraph.add_run(text)
    return True, ai_count


def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool:
    """
    Process a single document and insert placeholders.

    ``.doc`` inputs are first converted to ``.docx``.  Every non-empty
    paragraph (body paragraphs and paragraphs in table cells) is run through
    the optional AI analysis and then through rule-based label matching; the
    result is saved to ``output_path``.

    Args:
        input_path: Source ``.doc``/``.docx`` file.
        output_path: Destination ``.docx`` file (parent directories are created).
        doc_config: Document configuration; ``fields`` and ``template_code``
            are read from it.
        use_ai: Whether to use AI analysis when the helper is importable
            (default ``True``).

    Returns:
        ``True`` on success, ``False`` if conversion or processing failed.
    """
    try:
        # Legacy .doc files must be converted before python-docx can open them.
        if input_path.suffix.lower() == '.doc':
            print(f" 转换 .doc 到 .docx: {input_path.name}")
            docx_path = convert_doc_to_docx(input_path)
            if not docx_path or not docx_path.exists():
                print(f" ⚠ 跳过: 无法转换 {input_path.name}")
                return False
            input_path = docx_path

        # Set up the AI helper when requested and available.
        ai_helper = None
        available_fields = []
        if use_ai and HAS_AI_HELPER:
            try:
                ai_helper = TemplateAIHelper()
                available_fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE)
                print(" ✓ AI分析已启用")
            except Exception as e:
                print(f" ⚠ AI分析不可用: {e},将使用基础模式")
                ai_helper = None
        ai_enabled = bool(ai_helper and available_fields)

        doc = Document(str(input_path))
        doc_type = doc_config.get('template_code', '未知')
        # Patterns depend only on the config, so build them once per document.
        rules = _build_field_rules(doc_config.get('fields', []))

        replacement_count = 0      # paragraphs rewritten
        ai_replacement_count = 0   # AI suggestions returned for changed paragraphs

        # Body paragraphs
        for para_idx, paragraph in enumerate(doc.paragraphs):
            analyze = None
            if ai_enabled:
                analyze = lambda text: ai_helper.analyze_paragraph(
                    text, available_fields, doc_type
                )
            changed, ai_count = _process_paragraph(
                paragraph, rules, analyze, f"段落 {para_idx+1}"
            )
            ai_replacement_count += ai_count
            replacement_count += 1 if changed else 0

        # Paragraphs inside table cells
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    analyze = None
                    if ai_enabled:
                        analyze = lambda text, r=row_idx, c=col_idx: ai_helper.analyze_table_cell(
                            text, available_fields, doc_type, r, c
                        )
                    label = f"表格 {table_idx+1} 单元格({row_idx+1},{col_idx+1})"
                    for paragraph in cell.paragraphs:
                        changed, ai_count = _process_paragraph(paragraph, rules, analyze, label)
                        ai_replacement_count += ai_count
                        replacement_count += 1 if changed else 0

        # Make sure the output directory exists, then save.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))

        # Summary
        if replacement_count > 0 or ai_replacement_count > 0:
            msg = " ✓ 处理成功"
            if ai_replacement_count > 0:
                msg += f"AI识别 {ai_replacement_count}"
            if replacement_count > 0:
                msg += f",规则匹配 {replacement_count}"
            print(msg)
        else:
            print(" ⚠ 处理完成,但未找到需要替换的内容(可能已包含占位符)")
        return True
    except Exception as e:
        # Top-level boundary for one document: report and let the batch continue.
        print(f" ✗ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
def process_all_templates():
    """
    Process every raw template below ``ORIGINAL_TEMPLATES_DIR``.

    Walks the directory tree, picks up ``.doc``/``.docx`` files (extension
    compared case-insensitively, Word lock files starting with ``~$`` are
    ignored), identifies each file's document type from its name and writes
    the processed ``.docx`` to the same relative location below
    ``OUTPUT_TEMPLATES_DIR``.  Prints a summary at the end.
    """
    print("="*80)
    print("开始处理原始模板文档")
    print("="*80)
    if not ORIGINAL_TEMPLATES_DIR.exists():
        print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}")
        return

    processed_count = 0
    skipped_count = 0
    failed_count = 0

    for root, dirs, files in os.walk(ORIGINAL_TEMPLATES_DIR):
        dirs.sort()  # in-place sort makes the traversal order deterministic
        for file in sorted(files):
            # Only Word documents; same case-insensitive suffix test that
            # process_document uses.
            if Path(file).suffix.lower() not in ('.doc', '.docx'):
                continue
            # "~$name.docx" is the lock file Word creates while a document is
            # open; it is not a real document.
            if file.startswith('~$'):
                continue

            input_path = Path(root) / file

            doc_config = identify_document_type(file)
            if not doc_config:
                print(f"\n⚠ 无法识别文档类型: {file}")
                print(f" 路径: {input_path}")
                skipped_count += 1
                continue

            # Mirror the relative directory structure in the output tree.
            relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR)
            output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{Path(file).stem}.docx"

            print(f"\n处理: {file}")
            print(f" 类型: {doc_config.get('template_code', 'UNKNOWN')}")
            print(f" 输出: {output_path}")

            if process_document(input_path, output_path, doc_config, use_ai=True):
                processed_count += 1
            else:
                failed_count += 1

    # Summary
    print("\n" + "="*80)
    print("处理完成")
    print("="*80)
    print(f"成功处理: {processed_count} 个文件")
    print(f"跳过: {skipped_count} 个文件")
    print(f"失败: {failed_count} 个文件")
    print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}")
    print("\n请检查生成的模板文件,确认占位符是否正确添加。")
    print("如有需要,请手动调整占位符位置。")


if __name__ == '__main__':
    process_all_templates()