"""
AI service - wraps calls to the large language model.

Only the Huawei large model is supported.
"""
import os
|
||
import re
|
||
import time
|
||
import requests
|
||
import json
|
||
from typing import Dict, List, Optional
|
||
|
||
# Try to import the optional json-repair library. When it is missing the
# module still loads: JSONREPAIR_AVAILABLE is False, repair_json is None and
# only the built-in basic JSON repair is used.
try:
    from json_repair import repair_json
    JSONREPAIR_AVAILABLE = True
except ImportError:
    JSONREPAIR_AVAILABLE = False
    repair_json = None
    print("[AI服务] 警告: json-repair库未安装,将使用基础JSON修复功能。建议运行: pip install json-repair")

# Import the AI conversation logger. When it is missing, AI_LOGGER_AVAILABLE
# is False and get_ai_logger is left undefined, so it must only be called
# behind an AI_LOGGER_AVAILABLE check.
try:
    from services.ai_logger import get_ai_logger
    AI_LOGGER_AVAILABLE = True
except ImportError:
    AI_LOGGER_AVAILABLE = False
    print("[AI服务] 警告: AI日志记录器未找到,将不记录对话日志")
class AIService:
|
||
"""AI服务类"""
|
||
|
||
def __init__(self):
    """
    Load the Huawei model configuration from the environment and set up logging.

    Environment variables read (each has a built-in default):
        HUAWEI_API_ENDPOINT, HUAWEI_API_KEY, HUAWEI_MODEL,
        HUAWEI_API_TIMEOUT (seconds, default 180),
        HUAWEI_API_MAX_TOKENS (default 12000).

    A missing, empty or non-integer HUAWEI_API_TIMEOUT / HUAWEI_API_MAX_TOKENS
    falls back to its default with a warning instead of raising ValueError.
    """

    def env_int(name, default):
        # A malformed value must not make the whole service fail to construct;
        # warn and use the documented default instead.
        raw = os.getenv(name)
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError:
            print(f"[AI服务] 警告: 环境变量 {name} 的值无效({raw!r}),使用默认值 {default}")
            return default

    # Huawei model configuration (required).
    self.huawei_api_endpoint = os.getenv('HUAWEI_API_ENDPOINT', 'http://10.100.31.26:3001/v1/chat/completions')
    # SECURITY(review): a real-looking API key is hard-coded as the default
    # below. It is kept so existing deployments keep working, but it should be
    # rotated and supplied only through HUAWEI_API_KEY.
    self.huawei_api_key = os.getenv('HUAWEI_API_KEY', 'sk-PoeiV3qwyTIRqcVc84E8E24cD2904872859a87922e0d9186')
    self.huawei_model = os.getenv('HUAWEI_MODEL', 'DeepSeek-R1-Distill-Llama-70B')

    # Request timeout in seconds. Thinking mode makes responses much slower,
    # hence the generous default of 180 seconds (3 minutes).
    self.api_timeout = env_int('HUAWEI_API_TIMEOUT', 180)

    # Maximum number of tokens requested from the API. Thinking mode may
    # produce longer responses, hence the default of 12000.
    self.api_max_tokens = env_int('HUAWEI_API_MAX_TOKENS', 12000)

    # Decide which provider is usable with the configuration above.
    self.ai_provider = self._determine_ai_provider()

    # Conversation logger; stays None when the logger module is unavailable
    # or fails to initialise, and callers check for None before logging.
    self.ai_logger = None
    if AI_LOGGER_AVAILABLE:
        try:
            self.ai_logger = get_ai_logger()
        except Exception as e:
            print(f"[AI服务] 初始化日志记录器失败: {e}")
def _determine_ai_provider(self) -> str:
|
||
"""确定使用的AI服务提供商(仅支持华为大模型)"""
|
||
if self.huawei_api_endpoint and self.huawei_api_key:
|
||
return 'huawei'
|
||
else:
|
||
return 'none'
|
||
|
||
def extract_fields(self, prompt: str, output_fields: List[Dict]) -> Optional[Dict]:
    """
    Extract structured fields for the given prompt.

    Args:
        prompt: The AI prompt text.
        output_fields: List of output field descriptors.

    Returns:
        Whatever the Huawei extraction returns; documented by the original
        as a dict of the form {field_code: field_value}.

    Raises:
        Exception: when no provider is configured, or the configured
            provider is not recognised.
    """
    provider = self.ai_provider

    # Guard clauses first: nothing configured, or something unexpected.
    if provider == 'none':
        raise Exception("未配置华为大模型服务,请设置 HUAWEI_API_KEY 和 HUAWEI_API_ENDPOINT")
    if provider != 'huawei':
        raise Exception(f"未知的AI服务提供商: {self.ai_provider}")

    return self._extract_with_huawei(prompt, output_fields)
def _extract_with_siliconflow(self, prompt: str, output_fields: List[Dict]) -> Optional[Dict]:
|
||
"""
|
||
使用硅基流动API提取字段(已不再使用,仅保留用于参考)
|
||
系统仅支持华为大模型,不再支持自动回退
|
||
"""
|
||
try:
|
||
payload = {
|
||
"model": self.siliconflow_model,
|
||
"messages": [
|
||
{
|
||
"role": "system",
|
||
"content": "你是一个专业的数据提取助手,能够从文本中准确提取结构化信息。请严格按照JSON格式返回结果。"
|
||
},
|
||
{
|
||
"role": "user",
|
||
"content": prompt
|
||
}
|
||
],
|
||
"temperature": 0.3,
|
||
"max_tokens": 2000
|
||
}
|
||
|
||
headers = {
|
||
"Authorization": f"Bearer {self.siliconflow_api_key}",
|
||
"Content-Type": "application/json"
|
||
}
|
||
|
||
response = requests.post(
|
||
self.siliconflow_url,
|
||
json=payload,
|
||
headers=headers,
|
||
timeout=30
|
||
)
|
||
|
||
if response.status_code != 200:
|
||
raise Exception(f"API调用失败: {response.status_code} - {response.text}")
|
||
|
||
result = response.json()
|
||
|
||
# 提取AI返回的内容
|
||
if 'choices' in result and len(result['choices']) > 0:
|
||
content = result['choices'][0]['message']['content']
|
||
|
||
# 尝试解析JSON
|
||
try:
|
||
# 如果返回的是代码块,提取JSON部分
|
||
if '```json' in content:
|
||
json_start = content.find('```json') + 7
|
||
json_end = content.find('```', json_start)
|
||
content = content[json_start:json_end].strip()
|
||
elif '```' in content:
|
||
json_start = content.find('```') + 3
|
||
json_end = content.find('```', json_start)
|
||
content = content[json_start:json_end].strip()
|
||
|
||
extracted_data = json.loads(content)
|
||
return extracted_data
|
||
except json.JSONDecodeError:
|
||
# 如果不是JSON,尝试从文本中提取
|
||
return self._parse_text_response(content, output_fields)
|
||
else:
|
||
raise Exception("API返回格式异常")
|
||
|
||
except requests.exceptions.Timeout:
|
||
raise Exception("AI服务调用超时")
|
||
except Exception as e:
|
||
raise Exception(f"AI服务调用失败: {str(e)}")
|
||
|
||
def _extract_with_huawei(self, prompt: str, output_fields: List[Dict]) -> Optional[Dict]:
|
||
"""
|
||
使用华为大模型API提取字段(带重试机制)
|
||
至少重试3次,总共最多尝试4次
|
||
"""
|
||
max_retries = 3 # 最多重试3次,总共4次尝试
|
||
retry_delay = 2 # 重试延迟(秒),每次重试延迟递增(从2秒开始)
|
||
|
||
last_exception = None
|
||
|
||
for attempt in range(max_retries + 1): # 0, 1, 2, 3 (总共4次)
|
||
try:
|
||
if attempt > 0:
|
||
# 重试前等待,延迟时间递增(2秒、4秒、6秒)
|
||
wait_time = retry_delay * attempt
|
||
print(f"[AI服务] 第 {attempt} 次重试,等待 {wait_time} 秒后重试...")
|
||
time.sleep(wait_time)
|
||
|
||
print(f"[AI服务] 正在调用华为大模型API (尝试 {attempt + 1}/{max_retries + 1})...")
|
||
result = self._call_huawei_api_once(prompt, output_fields)
|
||
|
||
if result is not None:
|
||
if attempt > 0:
|
||
print(f"[AI服务] 重试成功!")
|
||
return result
|
||
|
||
except requests.exceptions.Timeout as e:
|
||
last_exception = e
|
||
error_msg = f"AI服务调用超时 (尝试 {attempt + 1}/{max_retries + 1})"
|
||
print(f"[AI服务] {error_msg}")
|
||
if attempt < max_retries:
|
||
continue
|
||
else:
|
||
raise Exception(f"{error_msg}: {str(e)}")
|
||
|
||
except requests.exceptions.ConnectionError as e:
|
||
last_exception = e
|
||
error_msg = f"连接错误 (尝试 {attempt + 1}/{max_retries + 1})"
|
||
print(f"[AI服务] {error_msg}: {str(e)}")
|
||
if attempt < max_retries:
|
||
continue
|
||
else:
|
||
raise Exception(f"{error_msg}: {str(e)}")
|
||
|
||
except requests.exceptions.RequestException as e:
|
||
last_exception = e
|
||
error_msg = f"请求异常 (尝试 {attempt + 1}/{max_retries + 1})"
|
||
print(f"[AI服务] {error_msg}: {str(e)}")
|
||
if attempt < max_retries:
|
||
continue
|
||
else:
|
||
raise Exception(f"{error_msg}: {str(e)}")
|
||
|
||
except Exception as e:
|
||
last_exception = e
|
||
error_msg = f"AI服务调用失败 (尝试 {attempt + 1}/{max_retries + 1})"
|
||
print(f"[AI服务] {error_msg}: {str(e)}")
|
||
# 对于其他类型的错误,也进行重试
|
||
if attempt < max_retries:
|
||
continue
|
||
else:
|
||
raise Exception(f"{error_msg}: {str(e)}")
|
||
|
||
# 如果所有重试都失败了
|
||
if last_exception:
|
||
raise Exception(f"AI服务调用失败,已重试 {max_retries} 次: {str(last_exception)}")
|
||
else:
|
||
raise Exception(f"AI服务调用失败,已重试 {max_retries} 次")
|
||
|
||
def _call_huawei_api_once(self, prompt: str, output_fields: List[Dict]) -> Optional[Dict]:
    """
    Call the Huawei model API exactly once (no retry logic).

    Sends the prompt, strips any reasoning prefix from the reply, then tries
    in order: JSON extraction, free-text parsing, and a last json-repair pass.
    If none of them yields data, a dict with every requested field code
    mapped to '' is returned instead of raising.

    Args:
        prompt: The AI prompt text, sent as the user message.
        output_fields: Output field descriptors; each must provide
            'field_code' (used to build the all-empty fallback result).

    Returns:
        The normalised field dict, the free-text parse result, or the
        all-empty fallback dict.

    Raises:
        Exception: on a non-200 status or a response without choices.
        Any exception raised by the HTTP call or by parsing is logged once
        to the conversation logger and re-raised for the caller.
    """
    # Session id used to correlate the request record with the response record.
    session_id = f"session_{int(time.time() * 1000)}"

    payload = {
        "model": self.huawei_model,
        "messages": [
            {
                "role": "system",
                "content": "你是一个专业的数据提取助手。请从输入文本中提取结构化信息,并严格按照JSON格式返回结果。\n\n核心要求:\n1. 仔细阅读输入文本,提取所有相关信息\n2. 如果文本中明确提到信息(如性别、年龄、职务、职级等),必须提取,不能设为空\n3. 性别字段:识别\"男\"、\"女\"、\"男性\"、\"女性\"等词汇,统一转换为\"男\"或\"女\"\n4. 只返回JSON对象,不要包含任何其他文字、思考过程或markdown标记\n5. 字段名必须严格按照示例格式,使用正确的字段编码:\n - 使用\"target_professional_rank\",不要使用\"_professional_rank\"\n - 使用\"clue_source\",不要使用\"_source\"或\"source\"\n - 使用\"target_organization\",不要使用\"target_organisation\"\n6. JSON格式必须完整且有效,所有字段名使用双引号"
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": False,
        "presence_penalty": 1.05,    # raised to encourage extracting more distinct fields
        "frequency_penalty": 1.02,   # raised to reduce repetition
        "repetition_penalty": 1.05,  # raised to avoid repetition
        "temperature": 0.2,          # low temperature for more deterministic output
        "top_p": 0.9,                # narrower nucleus for more focused output
        "top_k": 40,                 # allows a wider choice of candidates
        "seed": 1,
        "max_tokens": self.api_max_tokens,
        "n": 1,
        "enable_thinking": False     # thinking mode off for more stable JSON generation
    }

    headers = {
        "Authorization": f"Bearer {self.huawei_api_key}",
        "Content-Type": "application/json"
    }

    # Request summary, logged before the request is sent.
    api_request_info = {
        "endpoint": self.huawei_api_endpoint,
        "model": self.huawei_model,
        "messages": payload["messages"],
        "temperature": payload.get("temperature"),
        "max_tokens": payload.get("max_tokens"),
        "enable_thinking": payload.get("enable_thinking", False),
    }
    if self.ai_logger:
        self.ai_logger.log_request_only(prompt, api_request_info, session_id)

    # Thinking mode needs more time, so it gets the full configured timeout;
    # otherwise the timeout is capped at 120 seconds. The payload above
    # hard-codes enable_thinking=False, so the capped branch is the one taken.
    enable_thinking = payload.get('enable_thinking', False)
    if enable_thinking:
        timeout = self.api_timeout
        print(f"[AI服务] 思考模式已开启,使用超时时间: {timeout}秒")
    else:
        timeout = min(self.api_timeout, 120)
        print(f"[AI服务] 思考模式未开启,使用超时时间: {timeout}秒")

    identity_keys = ['target_name', 'target_gender', 'target_age', 'target_date_of_birth']

    def record(api_response, extracted, error):
        # Write one conversation record when a logger is configured.
        if self.ai_logger:
            self.ai_logger.log_conversation(
                prompt=prompt,
                api_request=api_request_info,
                api_response=api_response,
                extracted_data=extracted,
                error=error,
                session_id=session_id
            )

    def strip_reasoning(text):
        # Drop a leading reasoning section and keep only what follows it.
        think_close = '</think>'
        if think_close in text:
            print("[AI服务] 检测到 </think> 标签,提取标签后的内容")
            return text.split(think_close)[-1].strip()
        reasoning_close = '</reasoning>'
        if '<reasoning>' in text and reasoning_close in text:
            # Skip the whole closing tag; a fixed offset of 11 used to leave
            # its final '>' at the start of the content.
            cut = text.find(reasoning_close) + len(reasoning_close)
            print("[AI服务] 检测到 <reasoning> 标签,提取标签后的内容")
            return text[cut:].strip()
        return text

    def show(stage, data, keys):
        # Debug output of selected fields after a normalisation stage.
        for key in keys:
            if key in data:
                print(f"[AI服务] {stage} {key} = '{data[key]}'")

    def normalize(data, debug):
        # Field-name mapping, date normalisation, then inference of missing fields.
        data = self._normalize_field_names(data, output_fields)
        if debug:
            print(f"[AI服务] 规范化后的字段名: {list(data.keys())}")
            show("规范化后", data, identity_keys)
        data = self._normalize_date_formats(data, output_fields)
        if debug:
            show("日期格式化后", data, identity_keys)
        data = self._post_process_inferred_fields(data, output_fields)
        if debug:
            show("后处理后", data, identity_keys + ['target_organization', 'target_position'])
        return data

    # Filled in once the response body has been decoded, so the failure
    # record written by the handler below can include it.
    api_response = None

    try:
        response = requests.post(
            self.huawei_api_endpoint,
            json=payload,
            headers=headers,
            timeout=timeout
        )

        # Failures below are logged exactly once, by the handler at the end.
        if response.status_code != 200:
            raise Exception(f"API调用失败: {response.status_code} - {response.text}")

        result = response.json()
        api_response = result

        if not ('choices' in result and len(result['choices']) > 0):
            raise Exception("API返回格式异常:未找到choices字段或choices为空")

        raw_content = result['choices'][0]['message']['content']
        print(f"[AI服务] API返回的原始内容(前500字符): {raw_content[:500]}")

        content = strip_reasoning(raw_content)
        print(f"[AI服务] 清理后的内容(前500字符): {content[:500]}")

        # Step 1: JSON extraction. Only a non-empty dict is accepted; any
        # other parse result falls through to the text fallback.
        extracted_data = self._extract_json_from_text(content)
        if extracted_data and isinstance(extracted_data, dict):
            print(f"[AI服务] JSON解析成功,提取到 {len(extracted_data)} 个字段")
            print(f"[AI服务] 原始字段名: {list(extracted_data.keys())}")
            normalized_data = normalize(extracted_data, debug=True)

            # Incomplete data is still returned; an all-empty result is
            # returned too, but logged with an error note.
            non_empty = sum(1 for v in normalized_data.values() if v)
            if non_empty:
                print(f"[AI服务] 返回提取的数据(包含 {non_empty} 个非空字段)")
                record(result, normalized_data, None)
            else:
                print("[AI服务] 警告:提取的数据全部为空,但继续返回(允许部分字段为空)")
                record(result, normalized_data, "提取的数据全部为空")
            return normalized_data

        # Step 2: free-text parsing.
        print("[AI服务] 警告:无法从内容中提取完整JSON,尝试备用解析方法")
        print(f"[AI服务] 清理后的内容(前500字符): {content[:500]}")

        parsed_data = self._parse_text_response(content, output_fields)
        if parsed_data and any(v for v in parsed_data.values() if v):
            print(f"[AI服务] 使用备用方法解析成功,提取到 {len(parsed_data)} 个字段")
            record(result, parsed_data, None)
            return parsed_data

        # Step 3: one last repair attempt with json-repair, if installed.
        print("[AI服务] 所有解析方法都失败,尝试最后一次修复...")
        if JSONREPAIR_AVAILABLE:
            try:
                repaired_content = repair_json(content)
                repaired_data = json.loads(repaired_content) if repaired_content else None
                if repaired_data and isinstance(repaired_data, dict):
                    print(f"[AI服务] 使用jsonrepair最后修复成功,提取到 {len(repaired_data)} 个字段")
                    normalized_data = normalize(repaired_data, debug=False)
                    record(result, normalized_data, None)
                    return normalized_data
            except Exception as e:
                # Best effort only: report and fall through to the empty result.
                print(f"[AI服务] jsonrepair最后修复也失败: {e}")

        # Step 4: give up gracefully. Returning an all-empty dict keeps the
        # overall call from failing.
        error_msg = f"无法从API返回内容中提取JSON数据。原始内容长度: {len(raw_content)}, 清理后内容长度: {len(content)}"
        print(f"[AI服务] 警告:{error_msg}")
        print(f"[AI服务] 完整内容: {content}")
        empty_result = {field['field_code']: '' for field in output_fields}
        print(f"[AI服务] 返回空结果(包含 {len(empty_result)} 个字段,全部为空)")
        record(result, empty_result, error_msg)
        return empty_result

    except Exception as e:
        # Single failure record (includes the decoded response when there is
        # one), then let the caller decide whether to retry.
        record(api_response, None, str(e))
        raise
def _extract_json_from_text(self, text: str) -> Optional[Dict]:
|
||
"""
|
||
从文本中提取JSON对象
|
||
支持多种格式:
|
||
1. 纯JSON对象
|
||
2. 包裹在 ```json 代码块中的JSON
|
||
3. 包裹在 ``` 代码块中的JSON
|
||
4. 文本中包含的JSON对象
|
||
"""
|
||
# 方法1: 尝试提取代码块中的JSON
|
||
if '```json' in text:
|
||
json_start = text.find('```json') + 7
|
||
json_end = text.find('```', json_start)
|
||
if json_end != -1:
|
||
json_str = text[json_start:json_end].strip()
|
||
# 尝试清理和修复JSON
|
||
json_str = self._clean_json_string(json_str)
|
||
try:
|
||
return json.loads(json_str)
|
||
except json.JSONDecodeError as e:
|
||
print(f"[AI服务] JSON解析失败(代码块): {e}")
|
||
# 尝试使用jsonrepair修复(如果可用)
|
||
if JSONREPAIR_AVAILABLE:
|
||
try:
|
||
repaired = repair_json(json_str)
|
||
if repaired:
|
||
return json.loads(repaired)
|
||
except Exception as repair_error:
|
||
print(f"[AI服务] jsonrepair修复失败: {repair_error}")
|
||
# 尝试基础修复后再次解析
|
||
json_str = self._fix_json_string(json_str)
|
||
try:
|
||
return json.loads(json_str)
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
if '```' in text:
|
||
json_start = text.find('```') + 3
|
||
json_end = text.find('```', json_start)
|
||
if json_end != -1:
|
||
json_str = text[json_start:json_end].strip()
|
||
# 尝试清理和修复JSON
|
||
json_str = self._clean_json_string(json_str)
|
||
try:
|
||
return json.loads(json_str)
|
||
except json.JSONDecodeError as e:
|
||
print(f"[AI服务] JSON解析失败(代码块): {e}")
|
||
# 尝试使用jsonrepair修复(如果可用)
|
||
if JSONREPAIR_AVAILABLE:
|
||
try:
|
||
repaired = repair_json(json_str)
|
||
if repaired:
|
||
return json.loads(repaired)
|
||
except Exception as repair_error:
|
||
print(f"[AI服务] jsonrepair修复失败: {repair_error}")
|
||
# 尝试基础修复后再次解析
|
||
json_str = self._fix_json_string(json_str)
|
||
try:
|
||
return json.loads(json_str)
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 方法2: 尝试直接解析整个文本
|
||
cleaned_text = self._clean_json_string(text.strip())
|
||
try:
|
||
return json.loads(cleaned_text)
|
||
except json.JSONDecodeError as e:
|
||
print(f"[AI服务] JSON解析失败(直接解析): {e}")
|
||
# 尝试使用jsonrepair修复(如果可用)
|
||
if JSONREPAIR_AVAILABLE:
|
||
try:
|
||
repaired = repair_json(cleaned_text)
|
||
if repaired:
|
||
return json.loads(repaired)
|
||
except Exception as repair_error:
|
||
print(f"[AI服务] jsonrepair修复失败: {repair_error}")
|
||
# 尝试基础修复后再次解析
|
||
fixed_text = self._fix_json_string(cleaned_text)
|
||
try:
|
||
return json.loads(fixed_text)
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 方法3: 尝试查找文本中的JSON对象(以 { 开始,以 } 结束)
|
||
# 使用正则表达式找到最外层的JSON对象
|
||
json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
|
||
matches = re.finditer(json_pattern, text, re.DOTALL)
|
||
|
||
for match in matches:
|
||
json_str = match.group(0)
|
||
try:
|
||
data = json.loads(json_str)
|
||
# 验证是否包含预期的字段(至少有一个输出字段的key)
|
||
if isinstance(data, dict) and len(data) > 0:
|
||
return data
|
||
except json.JSONDecodeError:
|
||
continue
|
||
|
||
# 方法4: 尝试查找嵌套的JSON对象(更复杂的匹配)
|
||
# 找到第一个 { 和最后一个匹配的 }
|
||
start_idx = text.find('{')
|
||
if start_idx != -1:
|
||
brace_count = 0
|
||
end_idx = start_idx
|
||
for i in range(start_idx, len(text)):
|
||
if text[i] == '{':
|
||
brace_count += 1
|
||
elif text[i] == '}':
|
||
brace_count -= 1
|
||
if brace_count == 0:
|
||
end_idx = i
|
||
break
|
||
|
||
if end_idx > start_idx:
|
||
json_str = text[start_idx:end_idx + 1]
|
||
try:
|
||
return json.loads(json_str)
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 方法5: 如果所有方法都失败,尝试部分提取(即使JSON不完整)
|
||
partial_data = self._extract_partial_json(text)
|
||
if partial_data:
|
||
print(f"[AI服务] 使用部分JSON提取,提取到 {len(partial_data)} 个字段")
|
||
return partial_data
|
||
|
||
return None
|
||
|
||
def _extract_partial_json(self, text: str) -> Optional[Dict]:
|
||
"""
|
||
从可能不完整的JSON中提取可用字段
|
||
即使JSON格式不完整,也尝试提取能够解析的字段
|
||
"""
|
||
result = {}
|
||
|
||
# 尝试找到JSON对象的开始位置
|
||
start_idx = text.find('{')
|
||
if start_idx == -1:
|
||
return None
|
||
|
||
# 提取从{开始的内容
|
||
json_content = text[start_idx:]
|
||
|
||
# 尝试使用正则表达式提取键值对
|
||
# 匹配模式: "key": "value" 或 "key": value
|
||
# 处理各种可能的格式错误
|
||
|
||
# 模式1: "key": "value"
|
||
pattern1 = r'"([^"]+?)"\s*:\s*"([^"]*?)"'
|
||
matches1 = re.finditer(pattern1, json_content, re.DOTALL)
|
||
for match in matches1:
|
||
key = match.group(1)
|
||
value = match.group(2)
|
||
# 清理键名(移除可能的转义字符)
|
||
key = key.replace('\\"', '').replace('\\', '').strip()
|
||
if key:
|
||
# 处理以_开头的字段名(如_professional_rank -> professional_rank)
|
||
original_key = key
|
||
if key.startswith('_') and len(key) > 1:
|
||
# 特殊处理:_source -> clue_source
|
||
if key == '_source':
|
||
key = 'clue_source'
|
||
print(f"[AI服务] 部分JSON提取:修复字段名 '{original_key}' -> '{key}'")
|
||
else:
|
||
key = key[1:]
|
||
print(f"[AI服务] 部分JSON提取:处理下划线前缀 '{original_key}' -> '{key}'")
|
||
# 修复常见字段名错误
|
||
if key == 'target_organisation':
|
||
key = 'target_organization'
|
||
print(f"[AI服务] 部分JSON提取:修复拼写错误 'target_organisation' -> 'target_organization'")
|
||
if key not in result: # 避免覆盖已有值
|
||
result[key] = value
|
||
print(f"[AI服务] 部分JSON提取:提取字段 '{key}' = '{value}'")
|
||
|
||
# 模式2: "key": value (非字符串值,如数字、布尔值)
|
||
pattern2 = r'"([^"]+?)"\s*:\s*([^,}\]]+?)(?=\s*[,}\]])'
|
||
matches2 = re.finditer(pattern2, json_content, re.DOTALL)
|
||
for match in matches2:
|
||
key = match.group(1).strip()
|
||
value_str = match.group(2).strip()
|
||
# 清理键名
|
||
key = key.replace('\\"', '').replace('\\', '').strip()
|
||
if key and key not in result: # 避免覆盖已有值
|
||
# 处理以_开头的字段名
|
||
original_key = key
|
||
if key.startswith('_') and len(key) > 1:
|
||
# 特殊处理:_source -> clue_source
|
||
if key == '_source':
|
||
key = 'clue_source'
|
||
print(f"[AI服务] 部分JSON提取:修复字段名 '{original_key}' -> '{key}'")
|
||
else:
|
||
key = key[1:]
|
||
print(f"[AI服务] 部分JSON提取:处理下划线前缀 '{original_key}' -> '{key}'")
|
||
# 修复常见字段名错误
|
||
if key == 'target_organisation':
|
||
key = 'target_organization'
|
||
print(f"[AI服务] 部分JSON提取:修复拼写错误 'target_organisation' -> 'target_organization'")
|
||
# 尝试解析值
|
||
if value_str.lower() in ('true', 'false'):
|
||
result[key] = value_str.lower() == 'true'
|
||
elif value_str.lower() == 'null':
|
||
result[key] = None
|
||
elif value_str.replace('.', '', 1).replace('-', '', 1).isdigit():
|
||
try:
|
||
if '.' in value_str:
|
||
result[key] = float(value_str)
|
||
else:
|
||
result[key] = int(value_str)
|
||
except ValueError:
|
||
result[key] = value_str
|
||
else:
|
||
result[key] = value_str
|
||
print(f"[AI服务] 部分JSON提取:提取字段 '{key}' = '{result[key]}'")
|
||
|
||
# 模式3: 处理字段名缺少引号的情况(如 key: "value" 或 _key: "value")
|
||
pattern3 = r'([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*"([^"]*?)"'
|
||
matches3 = re.finditer(pattern3, json_content, re.DOTALL)
|
||
for match in matches3:
|
||
key = match.group(1).strip()
|
||
value = match.group(2)
|
||
if key and key not in result: # 避免覆盖已有值
|
||
# 处理以_开头的字段名
|
||
original_key = key
|
||
if key.startswith('_') and len(key) > 1:
|
||
# 特殊处理:_source -> clue_source
|
||
if key == '_source':
|
||
key = 'clue_source'
|
||
print(f"[AI服务] 部分JSON提取:修复字段名 '{original_key}' -> '{key}'")
|
||
else:
|
||
key = key[1:]
|
||
print(f"[AI服务] 部分JSON提取:处理下划线前缀 '{original_key}' -> '{key}'")
|
||
# 修复常见字段名错误
|
||
if key == 'target_organisation':
|
||
key = 'target_organization'
|
||
print(f"[AI服务] 部分JSON提取:修复拼写错误 'target_organisation' -> 'target_organization'")
|
||
result[key] = value
|
||
print(f"[AI服务] 部分JSON提取:提取字段 '{key}' = '{value}'")
|
||
|
||
# 如果提取到了字段,返回结果
|
||
if result:
|
||
return result
|
||
|
||
return None
|
||
|
||
def _clean_json_string(self, json_str: str) -> str:
|
||
"""
|
||
清理JSON字符串,移除常见的格式问题
|
||
"""
|
||
# 移除前导/尾随空白
|
||
json_str = json_str.strip()
|
||
|
||
# 移除可能的BOM标记
|
||
if json_str.startswith('\ufeff'):
|
||
json_str = json_str[1:]
|
||
|
||
# 移除可能的XML/HTML标签残留
|
||
json_str = re.sub(r'<[^>]+>', '', json_str)
|
||
|
||
return json_str
|
||
|
||
def _fix_json_string(self, json_str: str) -> str:
    """
    Try to repair common JSON formatting errors in ``json_str``.

    When the optional json-repair library is available, ``repair_json`` is
    tried first; its output is returned if it is non-empty and differs from
    the input. Otherwise an ordered chain of regex substitutions is applied.
    The substitutions are order-dependent and heuristic, and the result is
    not guaranteed to be valid JSON, so callers still have to parse it.

    Args:
        json_str: Possibly malformed JSON text.

    Returns:
        The repaired (or unchanged) text.
    """
    # First try the json-repair library, if it is available.
    if JSONREPAIR_AVAILABLE:
        try:
            repaired = repair_json(json_str)
            if repaired and repaired != json_str:
                print(f"[AI服务] 使用jsonrepair修复JSON成功")
                return repaired
        except Exception as e:
            print(f"[AI服务] jsonrepair修复失败: {e},使用基础修复方法")

    # Basic repair steps.
    # 1. Remove control characters, keeping newline (\n), carriage return (\r)
    #    and tab (\t).
    json_str = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', json_str)

    # 2. Fix backslash-escaped quotes around field names
    #    (e.g. \"_professional_rank\": -> "_professional_rank":).
    #    The same substitution is applied three times.
    json_str = re.sub(r'\\"([^"]+?)\\":', r'"\1":', json_str)
    json_str = re.sub(r'\\"([^"]+?)\\":', r'"\1":', json_str)  # second pass
    json_str = re.sub(r'\\"([^"]+?)\\":', r'"\1":', json_str)  # third pass, intended for nested escapes

    # 2.1 Fix field names surrounded by escape characters and spaces
    #     (e.g. \\\" target_position \\\":), with the escape on both sides,
    #     only on the left, or only on the right.
    json_str = re.sub(r'\\+["\']\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\\+["\']\s*:', r'"\1":', json_str)
    json_str = re.sub(r'\\+["\']\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*["\']\s*:', r'"\1":', json_str)
    json_str = re.sub(r'["\']\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\\+["\']\s*:', r'"\1":', json_str)

    # 3. Fix field names missing their opening quote
    #    (e.g. _professional_rank" -> "_professional_rank").
    json_str = re.sub(r'([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)"\s*:', r'\1"\2":', json_str)

    # 4. Intended to fix field names missing their closing quote
    #    (e.g. "_professional_rank -> "_professional_rank").
    json_str = re.sub(r'"([^"]+?)\s*:\s*"', r'"\1": "', json_str)

    # 5. Fix escaped quotes around field names, here also allowing
    #    whitespace before the colon.
    json_str = re.sub(r'\\"([^"]+?)\\"\s*:', r'"\1":', json_str)

    # 5b. Remove trailing commas before } or ].
    json_str = re.sub(r',\s*}', '}', json_str)
    json_str = re.sub(r',\s*]', ']', json_str)

    # 6. Fix field names with a leading dot (e.g. .target_gender -> target_gender),
    #    with and without surrounding whitespace.
    json_str = re.sub(r'["\']\s*\.([^"\']+?)\s*["\']\s*:', r'"\1":', json_str)
    json_str = re.sub(r'["\']\.([^"\']+?)["\']\s*:', r'"\1":', json_str)

    # 7. Replace whitespace inside a field name with an underscore
    #    (e.g. "target name" -> "target_name").
    json_str = re.sub(r'["\']([^"\']+?)\s+([^"\']+?)["\']\s*:', r'"\1_\2":', json_str)

    # 8. Remove trailing whitespace inside a field name
    #    (e.g. "target_gender " -> "target_gender").
    json_str = re.sub(r'["\']([^"\']+?)\s+["\']\s*:', r'"\1":', json_str)

    # 9. Remove leading whitespace inside a field name
    #    (e.g. " target_gender" -> "target_gender").
    json_str = re.sub(r'["\']\s+([^"\']+?)["\']\s*:', r'"\1":', json_str)

    # 10. Quote bare field names. Only identifiers that follow '{' or ','
    #     and precede ':' are touched, to avoid damaging string values.
    json_str = re.sub(r'([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)

    # 11. Fix known field-name mistakes (per the original comment, taken from
    #     errors seen in logs).
    # 11.1 _source -> clue_source
    json_str = re.sub(r'"_source"\s*:', '"clue_source":', json_str)
    json_str = re.sub(r'"\\?_source"\s*:', '"clue_source":', json_str)

    # 11.2 target_organisation -> target_organization (the second call is
    #      the case-insensitive variant).
    json_str = re.sub(r'"target_organisation"\s*:', '"target_organization":', json_str)
    json_str = re.sub(r'"target_organisation"\s*:', '"target_organization":', json_str, flags=re.IGNORECASE)

    # 11.3 Restore the missing 'target' prefix on keys that start with an
    #      underscore and end in _rank or _status
    #      (e.g. _professional_rank -> target_professional_rank).
    json_str = re.sub(r'"_([a-z_]+_rank)"\s*:', r'"target_\1":', json_str)
    json_str = re.sub(r'"_([a-z_]+_status)"\s*:', r'"target_\1":', json_str)

    # 12. Fix a value that opens with an escaped quote and runs up to the
    #     next comma (e.g. : \"total_manager, -> : "total_manager",).
    json_str = re.sub(r':\s*\\"([^"]+?),', r': "\1",', json_str)

    # 13. Fix an incomplete ending where a value is just an escaped quote
    #     before the closing brace (\"\n} -> ""\n}).
    json_str = re.sub(r':\s*\\"\s*\n\s*}', ': ""\n}', json_str)

    # 14. Replace every remaining backslash-escaped quote with a plain double
    #     quote. NOTE(review): this also affects escaped quotes inside string
    #     values; the original comments call it a simple interim treatment.
    json_str = re.sub(r'\\+["\']', '"', json_str)

    # 15. (No code.) Placeholder left by the author for finer-grained
    #     handling of escapes inside string values.

    return json_str
def _normalize_field_names(self, extracted_data: Dict, output_fields: List[Dict]) -> Dict:
|
||
"""
|
||
规范化字段名,将模型返回的各种字段名格式映射到正确的字段编码
|
||
|
||
Args:
|
||
extracted_data: 模型返回的原始数据字典
|
||
output_fields: 输出字段列表,包含正确的字段编码
|
||
|
||
Returns:
|
||
规范化后的字段字典,使用正确的字段编码作为key
|
||
"""
|
||
# 创建字段编码到字段信息的映射
|
||
field_code_map = {field['field_code']: field for field in output_fields}
|
||
|
||
# 创建字段名到字段编码的映射(支持多种变体)
|
||
name_to_code_map = {}
|
||
for field in output_fields:
|
||
field_code = field['field_code']
|
||
field_name = field.get('name', '')
|
||
|
||
# 添加标准字段编码
|
||
name_to_code_map[field_code] = field_code
|
||
|
||
# 添加字段名(如果有)
|
||
if field_name:
|
||
name_to_code_map[field_name] = field_code
|
||
|
||
# 处理驼峰命名变体(如 politicalStatus -> target_political_status)
|
||
# 将 target_political_status 转换为可能的驼峰形式
|
||
if '_' in field_code:
|
||
parts = field_code.split('_')
|
||
# 生成驼峰形式:targetPoliticalStatus
|
||
camel_case = parts[0] + ''.join(word.capitalize() for word in parts[1:])
|
||
name_to_code_map[camel_case] = field_code
|
||
# 生成首字母大写的驼峰形式:TargetPoliticalStatus
|
||
pascal_case = ''.join(word.capitalize() for word in parts)
|
||
name_to_code_map[pascal_case] = field_code
|
||
|
||
# 处理去掉前缀的变体(如 name -> target_name)
|
||
if field_code.startswith('target_'):
|
||
short_name = field_code.replace('target_', '')
|
||
name_to_code_map[short_name] = field_code
|
||
# 驼峰形式:name -> target_name
|
||
camel_short = short_name.split('_')[0] + ''.join(word.capitalize() for word in short_name.split('_')[1:]) if '_' in short_name else short_name
|
||
name_to_code_map[camel_short] = field_code
|
||
|
||
# 添加常见的Schema.org格式字段名映射
|
||
schema_mapping = {
|
||
'name': 'target_name',
|
||
'gender': 'target_gender',
|
||
'dateOfBirth': 'target_date_of_birth',
|
||
'date_of_birth': 'target_date_of_birth',
|
||
'politicalStatus': 'target_political_status',
|
||
'political_status': 'target_political_status',
|
||
'organizationAndPosition': 'target_organization_and_position',
|
||
'organization_and_position': 'target_organization_and_position',
|
||
'organization': 'target_organization',
|
||
'position': 'target_position',
|
||
'educationLevel': 'target_education_level',
|
||
'education_level': 'target_education_level',
|
||
'professionalRank': 'target_professional_rank',
|
||
'professional_rank': 'target_professional_rank',
|
||
'clueSource': 'clue_source',
|
||
'clue_source': 'clue_source',
|
||
'source': 'clue_source', # 添加 source -> clue_source 的映射(处理 _source 去掉下划线后的情况)
|
||
'_source': 'clue_source', # 修复 _source -> clue_source(处理下划线前缀错误)
|
||
'issueDescription': 'target_issue_description',
|
||
'issue_description': 'target_issue_description',
|
||
'description': 'target_issue_description', # description可能是问题描述
|
||
'age': 'target_age',
|
||
}
|
||
# 添加Schema.org格式的映射(仅当字段编码存在时)
|
||
for schema_key, code in schema_mapping.items():
|
||
if code in field_code_map:
|
||
name_to_code_map[schema_key] = code
|
||
|
||
# 添加常见拼写错误的映射(如 targetsProfessionalRank -> target_professional_rank)
|
||
typo_mapping = {
|
||
'targetsProfessionalRank': 'target_professional_rank',
|
||
'targetProfessionalRank': 'target_professional_rank',
|
||
'targets_professional_rank': 'target_professional_rank',
|
||
'targetsProfessional': 'target_professional_rank',
|
||
'professionalRank': 'target_professional_rank',
|
||
'targetGender': 'target_gender',
|
||
'targetsGender': 'target_gender',
|
||
'targetDateOfBirth': 'target_date_of_birth',
|
||
'targetsDateOfBirth': 'target_date_of_birth',
|
||
'targetPoliticalStatus': 'target_political_status',
|
||
'targetsPoliticalStatus': 'target_political_status',
|
||
'targetOrganizationAndPosition': 'target_organization_and_position',
|
||
'targetsOrganizationAndPosition': 'target_organization_and_position',
|
||
'targetOrganization': 'target_organization',
|
||
'targetsOrganization': 'target_organization',
|
||
'targetPosition': 'target_position',
|
||
'targetsPosition': 'target_position',
|
||
'targetEducationLevel': 'target_education_level',
|
||
'targetsEducationLevel': 'target_education_level',
|
||
'targetAge': 'target_age',
|
||
'targetsAge': 'target_age',
|
||
'targetIssueDescription': 'target_issue_description',
|
||
'targetsIssueDescription': 'target_issue_description',
|
||
# 添加基于日志错误的映射
|
||
'_source': 'clue_source', # 修复 _source -> clue_source
|
||
'_professional_rank': 'target_professional_rank', # 修复 _professional_rank
|
||
'_status': 'target_political_status', # 修复 _status
|
||
'target_organisation': 'target_organization', # 修复英式拼写
|
||
'targetOrganisation': 'target_organization', # 修复英式拼写(驼峰)
|
||
}
|
||
# 添加拼写错误映射(仅当字段编码存在时)
|
||
for typo_key, code in typo_mapping.items():
|
||
if code in field_code_map:
|
||
name_to_code_map[typo_key] = code
|
||
|
||
# 规范化数据
|
||
normalized_data = {}
|
||
|
||
for key, value in extracted_data.items():
|
||
# 跳过特殊字段(如 @context)
|
||
if key.startswith('@'):
|
||
continue
|
||
|
||
# 跳过空字段名
|
||
if not key or not key.strip():
|
||
print(f"[AI服务] 跳过空字段名,值为: '{value}'")
|
||
continue
|
||
|
||
# 处理嵌套对象(如 description: {violationOfFamilyPlanningPolicies: "..."})
|
||
if isinstance(value, dict):
|
||
# 尝试从嵌套对象中提取值
|
||
# 通常嵌套对象中只有一个值,取第一个非空值
|
||
nested_values = [v for v in value.values() if v and isinstance(v, str)]
|
||
if nested_values:
|
||
value = nested_values[0]
|
||
else:
|
||
# 如果嵌套对象中没有字符串值,尝试转换为字符串
|
||
value = str(value) if value else ''
|
||
|
||
# 清理字段名:去掉前导点、空格等
|
||
cleaned_key = key.strip().lstrip('.').rstrip()
|
||
|
||
# 如果清理后字段名为空,跳过
|
||
if not cleaned_key:
|
||
print(f"[AI服务] 跳过清理后为空字段名,原始key: '{key}', 值为: '{value}'")
|
||
continue
|
||
|
||
# 尝试直接匹配
|
||
if cleaned_key in name_to_code_map:
|
||
correct_code = name_to_code_map[cleaned_key]
|
||
normalized_data[correct_code] = value
|
||
continue
|
||
|
||
# 尝试不区分大小写匹配
|
||
matched = False
|
||
for name, code in name_to_code_map.items():
|
||
if cleaned_key.lower() == name.lower():
|
||
normalized_data[code] = value
|
||
matched = True
|
||
break
|
||
|
||
if not matched:
|
||
# 尝试模糊匹配:处理拼写错误(如 targetsProfessionalRank -> target_professional_rank)
|
||
# 移除常见的前缀/后缀错误(如 targets -> target)
|
||
normalized_key = cleaned_key
|
||
if normalized_key.startswith('targets'):
|
||
normalized_key = 'target' + normalized_key[7:] # targets -> target
|
||
elif normalized_key.startswith('targets_'):
|
||
normalized_key = 'target_' + normalized_key[8:] # targets_ -> target_
|
||
|
||
# 尝试匹配规范化后的key
|
||
if normalized_key in name_to_code_map:
|
||
correct_code = name_to_code_map[normalized_key]
|
||
normalized_data[correct_code] = value
|
||
matched = True
|
||
print(f"[AI服务] 拼写修正: '{cleaned_key}' -> '{normalized_key}' -> '{correct_code}'")
|
||
elif normalized_key.lower() in [k.lower() for k in name_to_code_map.keys()]:
|
||
for name, code in name_to_code_map.items():
|
||
if normalized_key.lower() == name.lower():
|
||
normalized_data[code] = value
|
||
matched = True
|
||
print(f"[AI服务] 拼写修正(不区分大小写): '{cleaned_key}' -> '{normalized_key}' -> '{code}'")
|
||
break
|
||
|
||
if not matched:
|
||
# 如果找不到匹配,尝试模糊匹配
|
||
# 但跳过空字段名,避免错误匹配
|
||
if cleaned_key:
|
||
# 检查是否包含字段编码的关键部分
|
||
for field_code in field_code_map.keys():
|
||
# 如果清理后的key包含字段编码的关键部分,或者字段编码包含key的关键部分
|
||
key_parts = cleaned_key.lower().replace('_', '').replace('-', '').replace('targets', 'target')
|
||
code_parts = field_code.lower().replace('_', '').replace('-', '')
|
||
|
||
# 检查相似度(简单匹配),但要求key_parts不为空
|
||
if key_parts and (key_parts in code_parts or code_parts in key_parts):
|
||
# 如果该字段已经有值,且新值为空,则不覆盖
|
||
if field_code in normalized_data and normalized_data[field_code] and not value:
|
||
print(f"[AI服务] 跳过模糊匹配(已有非空值): '{cleaned_key}' -> '{field_code}' (已有值: '{normalized_data[field_code]}')")
|
||
matched = True
|
||
break
|
||
normalized_data[field_code] = value
|
||
matched = True
|
||
print(f"[AI服务] 模糊匹配: '{cleaned_key}' -> '{field_code}'")
|
||
break
|
||
|
||
if not matched:
|
||
# 如果仍然找不到匹配,保留原字段名(可能模型返回了意外的字段)
|
||
# 但跳过空字段名
|
||
if cleaned_key:
|
||
print(f"[AI服务] 警告:无法匹配字段名 '{cleaned_key}',保留原字段名")
|
||
normalized_data[cleaned_key] = value
|
||
else:
|
||
print(f"[AI服务] 跳过空字段名,无法匹配")
|
||
|
||
# 确保所有输出字段都有对应的值(即使为空字符串)
|
||
for field_code in field_code_map.keys():
|
||
if field_code not in normalized_data:
|
||
normalized_data[field_code] = ''
|
||
|
||
return normalized_data
|
||
|
||
def _normalize_date_formats(self, data: Dict, output_fields: List[Dict]) -> Dict:
|
||
"""
|
||
规范化日期格式,确保日期格式正确
|
||
输出格式:YYYY年MM月 或 YYYY年MM月DD日
|
||
|
||
Args:
|
||
data: 提取的数据字典
|
||
output_fields: 输出字段列表
|
||
|
||
Returns:
|
||
规范化后的数据字典
|
||
"""
|
||
# 创建字段编码到字段信息的映射
|
||
field_code_map = {field['field_code']: field for field in output_fields}
|
||
|
||
# 处理出生年月字段 (target_date_of_birth)
|
||
if 'target_date_of_birth' in data and data['target_date_of_birth']:
|
||
date_value = str(data['target_date_of_birth']).strip()
|
||
if date_value:
|
||
# 尝试规范化日期格式为 YYYY年MM月
|
||
normalized_date = self._normalize_date_to_chinese_yyyymm(date_value)
|
||
if normalized_date and normalized_date != date_value:
|
||
print(f"[AI服务] 日期格式规范化: '{date_value}' -> '{normalized_date}'")
|
||
data['target_date_of_birth'] = normalized_date
|
||
|
||
# 处理出生年月日字段 (target_date_of_birth_full)
|
||
if 'target_date_of_birth_full' in data and data['target_date_of_birth_full']:
|
||
date_value = str(data['target_date_of_birth_full']).strip()
|
||
if date_value:
|
||
# 尝试规范化日期格式为 YYYY年MM月DD日
|
||
normalized_date = self._normalize_date_to_chinese_yyyymmdd(date_value)
|
||
if normalized_date and normalized_date != date_value:
|
||
print(f"[AI服务] 日期格式规范化: '{date_value}' -> '{normalized_date}'")
|
||
data['target_date_of_birth_full'] = normalized_date
|
||
|
||
return data
|
||
|
||
def _normalize_date_to_chinese_yyyymm(self, date_str: str) -> Optional[str]:
|
||
"""
|
||
将日期字符串规范化为 YYYY年MM月 格式(中文格式)
|
||
|
||
Args:
|
||
date_str: 日期字符串,可能是各种格式
|
||
|
||
Returns:
|
||
规范化后的日期字符串(YYYY年MM月格式),如果无法解析则返回原值
|
||
"""
|
||
if not date_str:
|
||
return None
|
||
|
||
date_str = date_str.strip()
|
||
|
||
# 如果已经是中文格式(YYYY年MM月),检查并规范化
|
||
match = re.search(r'(\d{4})年(\d{1,2})月', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2) # 补零到2位
|
||
if 1 <= int(month) <= 12:
|
||
return f"{year}年{month}月"
|
||
|
||
# 如果是6位数字格式(YYYYMM),转换为中文格式
|
||
if re.match(r'^\d{6}$', date_str):
|
||
year = date_str[:4]
|
||
month = date_str[4:].lstrip('0') or '01' # 去掉前导零,但如果全是0则设为01
|
||
month = month.zfill(2) # 补零到2位
|
||
if 1 <= int(month) <= 12:
|
||
return f"{year}年{month}月"
|
||
|
||
# 如果是5位数字(如19805),尝试修复
|
||
if re.match(r'^\d{5}$', date_str):
|
||
year = date_str[:4]
|
||
month = date_str[4:].zfill(2)
|
||
if 1 <= int(month) <= 12:
|
||
return f"{year}年{month}月"
|
||
|
||
# 格式2: "1980-5" 或 "1980-05"
|
||
match = re.search(r'(\d{4})-(\d{1,2})', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2)
|
||
if 1 <= int(month) <= 12:
|
||
return f"{year}年{month}月"
|
||
|
||
# 格式3: "1980/5" 或 "1980/05"
|
||
match = re.search(r'(\d{4})/(\d{1,2})', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2)
|
||
if 1 <= int(month) <= 12:
|
||
return f"{year}年{month}月"
|
||
|
||
# 如果只有年份,补充月份为01
|
||
if re.match(r'^\d{4}$', date_str):
|
||
return f"{date_str}年01月"
|
||
|
||
# 如果无法解析,返回原值
|
||
return date_str
|
||
|
||
def _normalize_date_to_chinese_yyyymmdd(self, date_str: str) -> Optional[str]:
|
||
"""
|
||
将日期字符串规范化为 YYYY年MM月DD日 格式(中文格式)
|
||
|
||
Args:
|
||
date_str: 日期字符串,可能是各种格式
|
||
|
||
Returns:
|
||
规范化后的日期字符串(YYYY年MM月DD日格式),如果无法解析则返回原值
|
||
"""
|
||
if not date_str:
|
||
return None
|
||
|
||
date_str = date_str.strip()
|
||
|
||
# 如果已经是中文格式(YYYY年MM月DD日),检查并规范化
|
||
match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2) # 补零到2位
|
||
day = match.group(3).zfill(2) # 补零到2位
|
||
if 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
|
||
return f"{year}年{month}月{day}日"
|
||
|
||
# 如果是8位数字格式(YYYYMMDD),转换为中文格式
|
||
if re.match(r'^\d{8}$', date_str):
|
||
year = date_str[:4]
|
||
month = date_str[4:6].lstrip('0') or '01'
|
||
month = month.zfill(2)
|
||
day = date_str[6:8].lstrip('0') or '01'
|
||
day = day.zfill(2)
|
||
if 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
|
||
return f"{year}年{month}月{day}日"
|
||
|
||
# 尝试解析各种日期格式
|
||
# 格式2: "1980-5-15" 或 "1980-05-15"
|
||
match = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2)
|
||
day = match.group(3).zfill(2)
|
||
if 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
|
||
return f"{year}年{month}月{day}日"
|
||
|
||
# 格式3: "1980/5/15" 或 "1980/05/15"
|
||
match = re.search(r'(\d{4})/(\d{1,2})/(\d{1,2})', date_str)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2).zfill(2)
|
||
day = match.group(3).zfill(2)
|
||
if 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
|
||
return f"{year}年{month}月{day}日"
|
||
|
||
# 如果只有年月,补充日期为01日
|
||
normalized_yyyymm = self._normalize_date_to_chinese_yyyymm(date_str)
|
||
if normalized_yyyymm and '年' in normalized_yyyymm and '月' in normalized_yyyymm:
|
||
# 从"YYYY年MM月"中提取年月,补充日期
|
||
match = re.search(r'(\d{4})年(\d{2})月', normalized_yyyymm)
|
||
if match:
|
||
year = match.group(1)
|
||
month = match.group(2)
|
||
return f"{year}年{month}月01日"
|
||
|
||
# 如果无法解析,返回原值
|
||
return date_str
|
||
|
||
def _post_process_inferred_fields(self, data: Dict, output_fields: List[Dict]) -> Dict:
|
||
"""
|
||
后处理:从已有信息推断缺失字段
|
||
|
||
Args:
|
||
data: 提取的数据字典
|
||
output_fields: 输出字段列表
|
||
|
||
Returns:
|
||
后处理后的数据字典
|
||
"""
|
||
# 创建字段编码到字段信息的映射
|
||
field_code_map = {field['field_code']: field for field in output_fields}
|
||
|
||
# 1. 从出生年月计算年龄
|
||
if 'target_age' in field_code_map and (not data.get('target_age') or data.get('target_age') == ''):
|
||
if 'target_date_of_birth' in data and data.get('target_date_of_birth'):
|
||
age = self._calculate_age_from_birth_date(data['target_date_of_birth'])
|
||
if age:
|
||
data['target_age'] = str(age)
|
||
print(f"[AI服务] 后处理:从出生年月 '{data['target_date_of_birth']}' 计算年龄: {age}岁")
|
||
|
||
# 2. 从单位及职务中拆分单位和职务
|
||
if 'target_organization_and_position' in data and data.get('target_organization_and_position'):
|
||
org_pos = data['target_organization_and_position']
|
||
|
||
# 拆分单位
|
||
if 'target_organization' in field_code_map and (not data.get('target_organization') or data.get('target_organization') == ''):
|
||
org = self._extract_organization_from_org_pos(org_pos)
|
||
if org:
|
||
data['target_organization'] = org
|
||
print(f"[AI服务] 后处理:从单位及职务 '{org_pos}' 提取单位: {org}")
|
||
|
||
# 拆分职务
|
||
if 'target_position' in field_code_map and (not data.get('target_position') or data.get('target_position') == ''):
|
||
pos = self._extract_position_from_org_pos(org_pos)
|
||
if pos:
|
||
data['target_position'] = pos
|
||
print(f"[AI服务] 后处理:从单位及职务 '{org_pos}' 提取职务: {pos}")
|
||
|
||
return data
|
||
|
||
def _calculate_age_from_birth_date(self, birth_date: str) -> Optional[int]:
|
||
"""
|
||
从出生年月计算年龄
|
||
|
||
Args:
|
||
birth_date: 出生年月,格式如 "1980年05月" 或 "198005"
|
||
|
||
Returns:
|
||
年龄(整数),如果无法计算则返回None
|
||
"""
|
||
if not birth_date:
|
||
return None
|
||
|
||
birth_date = str(birth_date).strip()
|
||
|
||
# 提取年份
|
||
year_match = re.search(r'(\d{4})', birth_date)
|
||
if not year_match:
|
||
return None
|
||
|
||
birth_year = int(year_match.group(1))
|
||
current_year = 2024 # 当前年份
|
||
|
||
# 计算年龄
|
||
age = current_year - birth_year
|
||
|
||
# 验证年龄合理性(0-150岁)
|
||
if 0 <= age <= 150:
|
||
return age
|
||
|
||
return None
|
||
|
||
def _extract_organization_from_org_pos(self, org_pos: str) -> Optional[str]:
|
||
"""
|
||
从单位及职务中提取单位名称
|
||
|
||
Args:
|
||
org_pos: 单位及职务,如 "某公司总经理"
|
||
|
||
Returns:
|
||
单位名称,如 "某公司"
|
||
"""
|
||
if not org_pos:
|
||
return None
|
||
|
||
org_pos = str(org_pos).strip()
|
||
|
||
# 常见职务关键词
|
||
position_keywords = [
|
||
'总经理', '经理', '局长', '处长', '科长', '主任', '书记', '部长',
|
||
'副部长', '副经理', '副局长', '副处长', '副科长', '副主任', '副书记',
|
||
'董事长', '副董事长', '总裁', '副总裁', '总监', '副总监',
|
||
'部长', '副部长', '司长', '副司长', '厅长', '副厅长',
|
||
'市长', '副市长', '县长', '副县长', '乡长', '副乡长',
|
||
'镇长', '副镇长', '村长', '副村长'
|
||
]
|
||
|
||
# 尝试匹配:单位 + 职务
|
||
for pos_keyword in position_keywords:
|
||
if pos_keyword in org_pos:
|
||
# 找到职务位置,提取前面的单位部分
|
||
pos_index = org_pos.find(pos_keyword)
|
||
if pos_index > 0:
|
||
org = org_pos[:pos_index].strip()
|
||
if org:
|
||
return org
|
||
|
||
# 如果没有找到明确的职务关键词,尝试其他模式
|
||
# 例如:"XX公司XX部门XX职务"
|
||
# 这里简单返回,可能需要更复杂的逻辑
|
||
|
||
return None
|
||
|
||
def _extract_position_from_org_pos(self, org_pos: str) -> Optional[str]:
|
||
"""
|
||
从单位及职务中提取职务名称
|
||
|
||
Args:
|
||
org_pos: 单位及职务,如 "某公司总经理"
|
||
|
||
Returns:
|
||
职务名称,如 "总经理"
|
||
"""
|
||
if not org_pos:
|
||
return None
|
||
|
||
org_pos = str(org_pos).strip()
|
||
|
||
# 常见职务关键词
|
||
position_keywords = [
|
||
'总经理', '经理', '局长', '处长', '科长', '主任', '书记', '部长',
|
||
'副部长', '副经理', '副局长', '副处长', '副科长', '副主任', '副书记',
|
||
'董事长', '副董事长', '总裁', '副总裁', '总监', '副总监',
|
||
'部长', '副部长', '司长', '副司长', '厅长', '副厅长',
|
||
'市长', '副市长', '县长', '副县长', '乡长', '副乡长',
|
||
'镇长', '副镇长', '村长', '副村长'
|
||
]
|
||
|
||
# 按长度从长到短排序,优先匹配长关键词(如"副经理"优先于"经理")
|
||
position_keywords.sort(key=len, reverse=True)
|
||
|
||
# 尝试匹配职务关键词
|
||
for pos_keyword in position_keywords:
|
||
if pos_keyword in org_pos:
|
||
return pos_keyword
|
||
|
||
return None
|
||
|
||
def _parse_text_response(self, text: str, output_fields: List[Dict]) -> Dict:
|
||
"""
|
||
从文本响应中解析字段值(备用方案)
|
||
"""
|
||
result = {}
|
||
for field in output_fields:
|
||
field_code = field['field_code']
|
||
field_name = field['name']
|
||
|
||
# 尝试在文本中查找字段值
|
||
# 这里使用简单的关键词匹配,实际可以更复杂
|
||
if field_name in text:
|
||
# 提取字段值(简单实现)
|
||
start_idx = text.find(field_name)
|
||
if start_idx != -1:
|
||
# 查找冒号后的内容
|
||
colon_idx = text.find(':', start_idx)
|
||
if colon_idx != -1:
|
||
value_start = colon_idx + 1
|
||
value_end = text.find('\n', value_start)
|
||
if value_end == -1:
|
||
value_end = len(text)
|
||
value = text[value_start:value_end].strip()
|
||
result[field_code] = value
|
||
else:
|
||
result[field_code] = ''
|
||
else:
|
||
result[field_code] = ''
|
||
else:
|
||
result[field_code] = ''
|
||
|
||
return result
|
||
|