739 lines
29 KiB
Python
739 lines
29 KiB
Python
"""
|
||
处理原始模板文档,自动添加占位符
|
||
根据占位符与字段对照表,智能识别文档类型并添加相应的占位符
|
||
使用AI大模型智能分析文档内容,识别可替换位置
|
||
"""
|
||
import os
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional
|
||
import json
|
||
|
||
try:
    from docx import Document
    from docx.shared import Pt
except ImportError:
    # python-docx is mandatory: process_document opens and saves every
    # template through docx.Document.
    print("错误: 请先安装 python-docx: pip install python-docx")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is missing
    # under `python -S` or in frozen builds, where the call would raise
    # NameError instead of terminating with status 1.
    raise SystemExit(1)
|
||
|
||
# Optional AI helper: when it cannot be imported the script falls back to the
# basic, rule-only mode.
try:
    from template_ai_helper import TemplateAIHelper, get_available_fields_for_document
except ImportError:
    HAS_AI_HELPER = False
    print("警告: 无法导入AI辅助工具,将使用基础模式(不使用AI分析)")
else:
    HAS_AI_HELPER = True
|
||
|
||
# Optional COM support (Windows), used to convert legacy .doc files.
try:
    import win32com.client
except ImportError:
    HAS_WIN32COM = False
    HAS_PYTHONCOM = False
else:
    HAS_WIN32COM = True
    # pythoncom is only probed once win32com itself has imported.
    try:
        import pythoncom
    except ImportError:
        HAS_PYTHONCOM = False
    else:
        HAS_PYTHONCOM = True

# Tell the user up front that .doc inputs cannot be converted automatically.
if not HAS_WIN32COM:
    print("\n".join((
        "=" * 60,
        "警告: 未安装 pywin32,无法自动转换 .doc 文件",
        "=" * 60,
        "解决方案:",
        " 1. 安装 pywin32: pip install pywin32",
        " 2. 或者手动将所有 .doc 文件转换为 .docx 格式",
        " 3. 转换后重新运行此脚本",
        "=" * 60,
    )))
|
||
|
||
# Project layout, derived from the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent
# Processed templates are written here ...
OUTPUT_TEMPLATES_DIR = PROJECT_ROOT.joinpath("模板")
# ... and the untouched originals are read from this sub-directory.
ORIGINAL_TEMPLATES_DIR = PROJECT_ROOT.joinpath("模板", "原始模板")
# Markdown table mapping placeholders to fields.
FIELD_MAPPING_FILE = PROJECT_ROOT.joinpath("占位符与字段对照表.md")
|
||
|
||
# Document type mapping (a type is recognised from the file name).
#
# Each key is a substring looked for in the file name; each value holds the
# template code, the placeholder field codes of that template ("fields") and
# the names listed under "input_fields".

# Field list used verbatim by eleven of the templates below.
_TARGET_PROFILE_FIELDS = (
    "target_name", "target_organization_and_position", "target_gender",
    "target_date_of_birth_full", "target_political_status", "target_address",
    "target_registered_address", "target_contact", "target_place_of_origin",
    "target_ethnicity", "target_id_number", "investigation_team_code",
)

# Input-field pair shared by several templates.
_CLUE_INPUT_FIELDS = ("clue_info", "target_basic_info_clue")


def _doc_type(template_code, fields, input_fields=()):
    """Build one mapping entry; sequences are copied so entries share no list."""
    return {
        "template_code": template_code,
        "fields": list(fields),
        "input_fields": list(input_fields),
    }


DOCUMENT_TYPE_MAPPING = {
    "请示报告卡": _doc_type(
        "REPORT_CARD",
        ("target_name", "target_organization_and_position", "report_card_request_time"),
        ("clue_info",),
    ),
    "初步核实审批表": _doc_type(
        "PRELIMINARY_VERIFICATION_APPROVAL",
        (
            "target_name", "target_organization_and_position", "target_gender",
            "target_date_of_birth", "target_political_status", "target_professional_rank",
            "clue_source", "target_issue_description", "department_opinion", "filler_name",
        ),
        _CLUE_INPUT_FIELDS,
    ),
    "初核方案": _doc_type(
        "INVESTIGATION_PLAN",
        (
            "target_name", "target_organization_and_position", "target_work_basic_info",
            "target_issue_description", "investigation_unit_name", "investigation_team_leader_name",
            "investigation_team_member_names", "investigation_location",
        ),
        _CLUE_INPUT_FIELDS,
    ),
    "谈话通知书": _doc_type(
        "NOTIFICATION_LETTER",
        (
            "target_name", "target_organization_and_position", "target_id_number",
            "appointment_time", "appointment_location", "approval_time",
            "handling_department", "handler_name", "notification_time", "notification_location",
        ),
        ("target_basic_info_clue",),
    ),
    "谈话笔录": _doc_type("INTERVIEW_RECORD", _TARGET_PROFILE_FIELDS),
    "谈话询问对象情况摸底调查30问": _doc_type("INVESTIGATION_30_QUESTIONS", _TARGET_PROFILE_FIELDS),
    "被谈话人权利义务告知书": _doc_type("RIGHTS_OBLIGATIONS_NOTICE", _TARGET_PROFILE_FIELDS),
    "点对点交接单": _doc_type("HANDOVER_FORM", _TARGET_PROFILE_FIELDS),
    "陪送交接单": _doc_type("ESCORT_HANDOVER_FORM", _TARGET_PROFILE_FIELDS),
    "保密承诺书": _doc_type("CONFIDENTIALITY_COMMITMENT", _TARGET_PROFILE_FIELDS),
    "办案人员-办案安全保密承诺书": _doc_type(
        "INVESTIGATOR_CONFIDENTIALITY_COMMITMENT", _TARGET_PROFILE_FIELDS
    ),
    "请示报告卡(初核报告结论)": _doc_type(
        "REPORT_CARD_CONCLUSION",
        ("investigation_team_code", "target_name", "target_problem_description", "target_attitude"),
    ),
    "初核情况报告": _doc_type(
        "INVESTIGATION_REPORT",
        (
            "target_name", "commission_name", "target_work_basic_info",
            "target_issue_description", "target_problem_description", "target_organization_and_position",
        ),
        _CLUE_INPUT_FIELDS,
    ),
    "谈话审批表": _doc_type("INTERVIEW_APPROVAL_FORM", _TARGET_PROFILE_FIELDS, _CLUE_INPUT_FIELDS),
    "谈话前安全风险评估表": _doc_type(
        "PRE_INTERVIEW_RISK_ASSESSMENT", _TARGET_PROFILE_FIELDS, _CLUE_INPUT_FIELDS
    ),
    "谈话方案": _doc_type("INTERVIEW_PLAN", _TARGET_PROFILE_FIELDS, _CLUE_INPUT_FIELDS),
    "谈话后安全风险评估表": _doc_type(
        "POST_INTERVIEW_RISK_ASSESSMENT", _TARGET_PROFILE_FIELDS, _CLUE_INPUT_FIELDS
    ),
}
|
||
|
||
# Mapping from the field label printed in a document to its field code.
# Several labels may share one code; entry order is significant because the
# rule matcher walks this dict in order.
FIELD_NAME_TO_CODE = {
    # --- person under verification ---
    "被核查人姓名": "target_name",
    "被核查人员单位及职务": "target_organization_and_position",
    "被核查人员性别": "target_gender",
    "被核查人员出生年月": "target_date_of_birth",
    "被核查人员出生年月日": "target_date_of_birth_full",
    "被核查人员政治面貌": "target_political_status",
    "被核查人员职级": "target_professional_rank",
    "被核查人员身份证号": "target_id_number",
    "被核查人员身份证件及号码": "target_id_number",
    "被核查人员住址": "target_address",
    "被核查人员户籍住址": "target_registered_address",
    "被核查人员联系方式": "target_contact",
    "被核查人员籍贯": "target_place_of_origin",
    "被核查人员民族": "target_ethnicity",
    # --- clue / issue description ---
    "线索来源": "clue_source",
    "主要问题线索": "target_issue_description",
    "被核查人问题描述": "target_problem_description",
    "被核查人员工作基本情况": "target_work_basic_info",
    # --- verification team ---
    "核查单位名称": "investigation_unit_name",
    "核查组组长姓名": "investigation_team_leader_name",
    "核查组成员姓名": "investigation_team_member_names",
    "核查地点": "investigation_location",
    "核查组代号": "investigation_team_code",
    # --- interview notification ---
    "应到时间": "appointment_time",
    "应到地点": "appointment_location",
    "批准时间": "approval_time",
    "承办部门": "handling_department",
    "承办人": "handler_name",
    "谈话通知时间": "notification_time",
    "谈话通知地点": "notification_location",
    # --- form-specific fields ---
    "请示报告卡请示时间": "report_card_request_time",
    "初步核实审批表承办部门意见": "department_opinion",
    "初步核实审批表填表人": "filler_name",
    "被核查人员本人认识和态度": "target_attitude",
    "纪委名称": "commission_name",
}
|
||
|
||
|
||
def convert_doc_to_docx(doc_path: Path) -> Optional[Path]:
    """Convert a legacy ``.doc`` file to ``.docx`` by automating Microsoft Word.

    The converted file is written next to the source, with the same stem and a
    ``.docx`` suffix.

    Args:
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        Path of the generated ``.docx`` file, or ``None`` when pywin32 is not
        available, the source file is missing, or the conversion fails.
    """
    if not HAS_WIN32COM:
        print(f" 警告: 未安装 pywin32,无法转换 {doc_path.name}")
        print(" 解决方案: pip install pywin32")
        print(f" 或者: 请手动将 {doc_path.name} 转换为 .docx 格式")
        return None

    # Validate the input before starting Word: there is no point launching a
    # COM server for a file that is not there.
    if not doc_path.exists():
        print(f" ✗ 错误: 源文件不存在: {doc_path}")
        return None

    docx_path = doc_path.with_suffix('.docx')
    word = None
    doc = None
    com_initialized = False

    try:
        if HAS_PYTHONCOM:
            pythoncom.CoInitialize()
            com_initialized = True

        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        word.DisplayAlerts = 0  # suppress Word's alert dialogs

        print(" 正在转换...")
        print(f" 源: {doc_path.name}")
        print(f" 目标: {docx_path.name}")

        # Absolute paths are passed to Word for both the source and the target.
        doc = word.Documents.Open(
            str(doc_path.absolute()),
            ReadOnly=True,
            ConfirmConversions=False,
            AddToRecentFiles=False,
        )
        # 16 = wdFormatXMLDocument (.docx)
        doc.SaveAs2(str(docx_path.absolute()), FileFormat=16)
    except Exception as e:
        error_msg = str(e)
        print(f" ✗ 转换失败: {type(e).__name__}: {error_msg}")

        # Best-effort diagnosis based on the error text.
        print(" 诊断信息:")
        if "Word.Application" in error_msg or "COM" in error_msg or "CreateObject" in error_msg:
            print(" - 可能原因: Microsoft Word 未安装或无法访问")
            print(" - 解决方案:")
            print(" 1. 确保已安装 Microsoft Word(不是 WPS)")
            print(" 2. 手动将 .doc 文件转换为 .docx 格式")
            print(" 3. 使用 Word 打开文件,另存为 .docx 格式")
        elif "pywin32" in error_msg.lower() or "win32com" in error_msg.lower():
            print(" - 解决方案: pip install pywin32")
        elif "权限" in error_msg or "Permission" in error_msg:
            print(" - 可能原因: 文件被其他程序占用或权限不足")
            print(" - 解决方案: 关闭文件,检查文件权限")
        else:
            print(" - 请检查错误信息并手动转换文件")
        return None
    finally:
        # Release everything on every path. Cleanup itself is best effort (a
        # failing Close/Quit must not mask the real outcome), but only
        # Exception is swallowed so Ctrl-C / SystemExit still propagate.
        if doc is not None:
            try:
                doc.Close(False)  # False: do not save changes
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
        # Only undo a CoInitialize that actually succeeded.
        if com_initialized:
            try:
                pythoncom.CoUninitialize()
            except Exception:
                pass

    # Word has been shut down at this point; verify the result on disk.
    try:
        file_size = docx_path.stat().st_size
    except OSError:
        file_size = 0

    if file_size > 0:
        print(f" ✓ 转换成功 ({file_size} 字节)")
        return docx_path

    print(" ✗ 转换失败: 目标文件不存在或为空")
    return None
|
||
|
||
|
||
def identify_document_type(file_name: str, mapping: Optional[Dict] = None) -> Optional[Dict]:
    """Identify the document type configuration from a file name.

    The file stem is NFKC-normalized (so full-width and ASCII punctuation
    compare equal), the ``(XXX)`` / ``XXX`` filler is removed, and the longest
    mapping key contained in the result wins. Picking the longest key matters
    because some keys contain others (for example "请示报告卡" is a prefix of
    "请示报告卡(初核报告结论)"); a first-match scan would return the shorter,
    wrong entry.

    Args:
        file_name: File name (with or without extension).
        mapping: Optional ``{type name: config}`` mapping to search; defaults
            to ``DOCUMENT_TYPE_MAPPING``.

    Returns:
        The matching configuration, or ``None`` when nothing matches.
    """
    import unicodedata

    if mapping is None:
        mapping = DOCUMENT_TYPE_MAPPING

    def _normalize(value: str) -> str:
        return unicodedata.normalize("NFKC", value)

    # After NFKC a full-width "(XXX)" has become "(XXX)", so one replace
    # covers both spellings.
    base_name = _normalize(Path(file_name).stem)
    base_name = base_name.replace("(XXX)", "").replace("XXX", "").strip()

    best_config = None
    best_length = 0

    # Pass 1: whole key contained in the file name; longest key wins, ties go
    # to the entry that comes first in the mapping.
    for doc_type, config in mapping.items():
        key = _normalize(doc_type)
        if key and key in base_name and len(key) > best_length:
            best_config, best_length = config, len(key)
    if best_config is not None:
        return best_config

    # Pass 2: keys written as several whitespace-separated keywords match when
    # any keyword is contained in the file name (again, longest keyword wins).
    for doc_type, config in mapping.items():
        for keyword in _normalize(doc_type).split():
            if keyword in base_name and len(keyword) > best_length:
                best_config, best_length = config, len(keyword)

    return best_config
|
||
|
||
|
||
def find_placeholder_positions(text: str, field_name: str, field_code: str) -> List[tuple]:
    """Find values that follow a field label and could become a placeholder.

    A candidate is the rest of the line after ``<field_name>`` and a colon
    (ASCII or full-width). Values that are empty, already start with ``{{``,
    or are one of the common filler strings are skipped.

    Args:
        text: Text to scan.
        field_name: Field label to look for.
        field_code: Field code used to build the ``{{field_code}}`` placeholder.

    Returns:
        A list of ``(start, end, replacement_text)`` tuples. ``start``/``end``
        delimit the value with surrounding whitespace excluded, so replacing
        the span does not swallow trailing spaces.
    """
    # \uFF1A is the full-width colon used in Chinese documents.
    pattern = re.compile(rf"{re.escape(field_name)}[:\uFF1A]\s*([^\n\r]+)")
    # Filler values that are not treated as real content.
    skip_values = {"XXX", "xxx", "-", "—", "——", "待填", "待填写"}
    placeholder = "{{" + field_code + "}}"

    positions = []
    for match in pattern.finditer(text):
        raw_value = match.group(1)
        value = raw_value.strip()
        if not value or value.startswith("{{") or value in skip_values:
            continue
        # Shift the span so that it covers exactly the stripped value.
        start = match.start(1) + (len(raw_value) - len(raw_value.lstrip()))
        positions.append((start, start + len(value), placeholder))

    return positions
|
||
|
||
|
||
def replace_text_in_runs(runs, old_text: str, new_text: str) -> bool:
    """Replace ``old_text`` with ``new_text`` inside a sequence of runs.

    The runs are treated as one concatenated string, so a match may start in
    one run and end in another. The replacement text is written into the first
    run touched by the match and the matched characters are removed from the
    following runs; runs outside the match are left untouched.

    Args:
        runs: Sequence of objects with a readable/writable ``text`` attribute.
        old_text: Text to replace. An empty string never matches.
        new_text: Replacement text.

    Returns:
        ``True`` if at least one replacement was made, ``False`` otherwise.
    """
    if not old_text:
        # "" is "found" everywhere; replacing it would corrupt the text.
        return False

    runs = list(runs)
    replaced = False
    search_from = 0

    while True:
        run_texts = [run.text for run in runs]
        start = ''.join(run_texts).find(old_text, search_from)
        if start < 0:
            return replaced
        end = start + len(old_text)

        offset = 0
        pending = new_text  # inserted once, in the first run of the match
        for run, run_text in zip(runs, run_texts):
            run_start, run_end = offset, offset + len(run_text)
            offset = run_end
            if run_end <= start or run_start >= end:
                continue  # run does not overlap the match
            cut_from = max(start, run_start) - run_start
            cut_to = min(end, run_end) - run_start
            run.text = run_text[:cut_from] + pending + run_text[cut_to:]
            pending = ''

        replaced = True
        # Continue after the inserted text so that a new_text containing
        # old_text cannot trigger an endless loop.
        search_from = start + len(new_text)
|
||
|
||
|
||
def apply_ai_replacements(text: str, ai_replacements: List[Dict]) -> str:
    """Apply the replacement suggestions produced by the AI analysis.

    Suggestions are processed from highest to lowest confidence. Only those
    with a confidence above 0.7 and non-empty ``original_text`` and
    ``replacement`` are used, and each one replaces just the first occurrence
    of its original text.

    Args:
        text: Original text.
        ai_replacements: Suggestion dicts with the keys ``original_text``,
            ``replacement`` and ``confidence``.

    Returns:
        The text after all accepted replacements.
    """
    def _confidence(suggestion: Dict) -> float:
        # Tolerate a missing or non-numeric confidence instead of failing the
        # whole paragraph: such a suggestion simply counts as 0.
        try:
            return float(suggestion.get('confidence', 0))
        except (TypeError, ValueError):
            return 0.0

    result_text = text
    for suggestion in sorted(ai_replacements or [], key=_confidence, reverse=True):
        if not _confidence(suggestion) > 0.7:
            continue
        original = suggestion.get('original_text', '')
        replacement_text = suggestion.get('replacement', '')
        if not original or not replacement_text:
            continue
        # Plain substring test: the text is replaced literally with
        # str.replace, so the needle must NOT be regex-escaped (escaping made
        # any original containing "(", ".", "-" ... impossible to find).
        if original in result_text:
            result_text = result_text.replace(original, replacement_text, 1)

    return result_text
|
||
|
||
|
||
# Values that mean "nothing filled in"; the rule matcher leaves them alone.
_EMPTY_FIELD_VALUES = ("——", "—", "-", "")


def _compile_field_rules(field_codes):
    """Compile, once per document, the label patterns for the given field codes.

    Every label in FIELD_NAME_TO_CODE that maps to one of the codes gets a
    rule, so alias labels sharing a code are all recognised.

    Returns:
        A list of ``(placeholder, value_pattern, filler_pattern)`` tuples.
    """
    rules = []
    for field_code in field_codes:
        placeholder = "{{" + field_code + "}}"
        for field_name, code in FIELD_NAME_TO_CODE.items():
            if code != field_code:
                continue
            # Label followed by an ASCII or full-width (\uFF1A) colon.
            label = rf"({re.escape(field_name)}[:\uFF1A]\s*)"
            # Pattern 1: "label: value", value ending at whitespace, end of
            # text, a comma (ASCII or full-width \uFF0C) or a full stop.
            value_pattern = re.compile(label + r"([^\n\r{]+?)(\s|$|\n|\r|[,\uFF0C]|。)")
            # Pattern 2: "label: XXX" style filler. Longest alternative first,
            # otherwise "待填" matches inside "待填写" and leaves a stray "写".
            filler_pattern = re.compile(label + r"(XXX|xxx|待填写|待填)")
            rules.append((placeholder, value_pattern, filler_pattern))
    return rules


def _apply_field_rules(text, rules):
    """Return ``text`` with every rule-matched field value replaced by its placeholder."""
    for placeholder, value_pattern, filler_pattern in rules:
        def _fill(match, placeholder=placeholder):
            value = match.group(2).strip()
            # Keep empty values, dashes and existing placeholders as they are.
            if value and not value.startswith("{{") and value not in _EMPTY_FIELD_VALUES:
                return f"{match.group(1)}{placeholder}{match.group(3)}"
            return match.group(0)

        text = value_pattern.sub(_fill, text)
        text = filler_pattern.sub(lambda m, p=placeholder: m.group(1) + p, text)
    return text


def _rewrite_paragraph(paragraph, rules, analyze=None, on_ai_error=None):
    """Insert placeholders into one paragraph (AI suggestions first, then rules).

    Args:
        paragraph: python-docx paragraph to update in place.
        rules: Result of ``_compile_field_rules``.
        analyze: Optional callable ``text -> list of AI suggestions``.
        on_ai_error: Optional callable invoked with the exception when the AI
            step fails; the rule-based step still runs afterwards.

    Returns:
        ``(changed, ai_hits)``: whether the paragraph text was rewritten, and
        the number of AI suggestions returned for it when they changed the text.
    """
    original_text = paragraph.text
    if not original_text:
        return False, 0

    text = original_text
    ai_hits = 0
    if analyze is not None:
        try:
            suggestions = analyze(text)
            if suggestions:
                text = apply_ai_replacements(text, suggestions)
                if text != original_text:
                    ai_hits = len(suggestions)
        except Exception as e:
            # AI analysis is optional; fall through to the rule-based pass.
            if on_ai_error is not None:
                on_ai_error(e)

    text = _apply_field_rules(text, rules)
    if text == original_text:
        return False, ai_hits

    # The whole paragraph is rebuilt as a single run holding the new text.
    paragraph.clear()
    paragraph.add_run(text)
    return True, ai_hits


def process_document(input_path: Path, output_path: Path, doc_config: Dict, use_ai: bool = True) -> bool:
    """Process one document: insert placeholders and save the result.

    ``.doc`` inputs are first converted to ``.docx``. Body paragraphs and the
    paragraphs of every table cell are then scanned, first with the AI helper
    (when enabled and available) and afterwards with the label-based rules.

    Args:
        input_path: Source document (``.doc`` or ``.docx``).
        output_path: Where the processed ``.docx`` is written; parent
            directories are created as needed.
        doc_config: Document type configuration (``template_code``, ``fields``).
        use_ai: Whether to use the AI analysis (default ``True``).

    Returns:
        ``True`` when the document was processed and saved, ``False`` when the
        conversion failed or any error occurred.
    """
    try:
        # Legacy .doc files must be converted before python-docx can open them.
        if input_path.suffix.lower() == '.doc':
            print(f" 转换 .doc 到 .docx: {input_path.name}")
            docx_path = convert_doc_to_docx(input_path)
            if not docx_path or not docx_path.exists():
                print(f" ⚠ 跳过: 无法转换 {input_path.name}")
                return False
            input_path = docx_path

        # Set up the AI helper (if available).
        ai_helper = None
        available_fields = []
        if use_ai and HAS_AI_HELPER:
            try:
                ai_helper = TemplateAIHelper()
                available_fields = get_available_fields_for_document(doc_config, FIELD_NAME_TO_CODE)
                print(" ✓ AI分析已启用")
            except Exception as e:
                print(f" ⚠ AI分析不可用: {e},将使用基础模式")
                ai_helper = None
        ai_enabled = bool(ai_helper and available_fields)

        # Loop invariants: computed once instead of once per paragraph.
        doc_type = doc_config.get('template_code', '未知')
        rules = _compile_field_rules(doc_config.get('fields', []))

        doc = Document(str(input_path))

        replacement_count = 0
        ai_replacement_count = 0

        # Body paragraphs.
        analyze_paragraph = None
        if ai_enabled:
            def analyze_paragraph(text):
                return ai_helper.analyze_paragraph(text, available_fields, doc_type)

        for para_idx, paragraph in enumerate(doc.paragraphs):
            def report_ai_error(error, number=para_idx + 1):
                print(f" ⚠ 段落 {number} AI分析失败: {error}")

            changed, ai_hits = _rewrite_paragraph(
                paragraph, rules, analyze_paragraph, report_ai_error
            )
            ai_replacement_count += ai_hits
            if changed:
                replacement_count += 1

        # Paragraphs inside table cells.
        for table in doc.tables:
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    analyze_cell = None
                    if ai_enabled:
                        def analyze_cell(text, row_idx=row_idx, col_idx=col_idx):
                            return ai_helper.analyze_table_cell(
                                text, available_fields, doc_type, row_idx, col_idx
                            )

                    for paragraph in cell.paragraphs:
                        # AI failures in cells are deliberately silent; the
                        # rule-based pass still runs.
                        changed, ai_hits = _rewrite_paragraph(paragraph, rules, analyze_cell)
                        ai_replacement_count += ai_hits
                        if changed:
                            replacement_count += 1

        # Make sure the output directory exists, then save.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))

        # Report statistics.
        if replacement_count > 0 or ai_replacement_count > 0:
            msg = " ✓ 处理成功"
            if ai_replacement_count > 0:
                msg += f",AI识别 {ai_replacement_count} 处"
            if replacement_count > 0:
                msg += f",规则匹配 {replacement_count} 处"
            print(msg)
        else:
            print(" ⚠ 处理完成,但未找到需要替换的内容(可能已包含占位符)")

        return True

    except Exception as e:
        print(f" ✗ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
|
||
def process_all_templates():
    """Process every original template found under ORIGINAL_TEMPLATES_DIR.

    The directory tree is walked recursively. Each ``.doc`` / ``.docx`` file
    whose type can be identified from its name is processed and written as a
    ``.docx`` under OUTPUT_TEMPLATES_DIR, keeping the relative directory
    structure. A summary of processed, skipped and failed files is printed.
    """
    print("="*80)
    print("开始处理原始模板文档")
    print("="*80)

    if not ORIGINAL_TEMPLATES_DIR.exists():
        print(f"错误: 原始模板目录不存在: {ORIGINAL_TEMPLATES_DIR}")
        return

    # Statistics.
    processed_count = 0
    skipped_count = 0
    failed_count = 0

    for root, _dirs, files in os.walk(ORIGINAL_TEMPLATES_DIR):
        for file in files:
            # "~$name.docx" files are lock files Word creates while a document
            # is open; they are not documents and would only produce failures.
            if file.startswith("~$"):
                continue

            # Only .doc and .docx files; compared case-insensitively so that
            # e.g. "X.DOC" is handled the same way process_document treats it.
            if Path(file).suffix.lower() not in ('.doc', '.docx'):
                continue

            input_path = Path(root) / file

            # Identify the document type.
            doc_config = identify_document_type(file)
            if not doc_config:
                print(f"\n⚠ 无法识别文档类型: {file}")
                print(f" 路径: {input_path}")
                skipped_count += 1
                continue

            # Build the output path (keeps the relative directory structure).
            relative_path = input_path.relative_to(ORIGINAL_TEMPLATES_DIR)
            output_path = OUTPUT_TEMPLATES_DIR / relative_path.parent / f"{Path(file).stem}.docx"

            print(f"\n处理: {file}")
            print(f" 类型: {doc_config.get('template_code', 'UNKNOWN')}")
            print(f" 输出: {output_path}")

            # Process the document (with AI analysis).
            if process_document(input_path, output_path, doc_config, use_ai=True):
                processed_count += 1
            else:
                failed_count += 1

    # Summary.
    print("\n" + "="*80)
    print("处理完成")
    print("="*80)
    print(f"成功处理: {processed_count} 个文件")
    print(f"跳过: {skipped_count} 个文件")
    print(f"失败: {failed_count} 个文件")
    print(f"\n处理后的模板保存在: {OUTPUT_TEMPLATES_DIR}")
    print("\n请检查生成的模板文件,确认占位符是否正确添加。")
    print("如有需要,请手动调整占位符位置。")
|
||
|
||
|
||
# Script entry point: process all templates when the file is run directly.
if __name__ == "__main__":
    process_all_templates()
|