更新提示配置，统一日期格式为中文格式，增强AI服务的日期规范化功能，添加对常见拼写错误的处理逻辑，改进字段名清理和规范化方法以提高数据提取准确性。

2025-12-09 12:34:01 +08:00 · 2025-12-09 12:34:01 +08:00 · e1d8d27dc4
commit e1d8d27dc4
parent e31cd0b764
2 changed files with 255 additions and 15 deletions
--- a/config/prompt_config.json
+++ b/config/prompt_config.json
@ -11,7 +11,7 @@
      "如果文本中明确提到某个信息，必须提取出来，不能设为空",
      "如果可以通过已有信息合理推断（如根据出生年月推算年龄），请进行推断并填写",
      "如果某个字段在输入文本中确实找不到任何相关信息，该字段值才设为空字符串\"\"",
-      "日期格式统一为YYYYMM（如：198005表示1980年5月），如果包含日期信息则格式为YYYYMMDD",
+      "日期格式统一为中文格式：YYYY年MM月（如：1980年05月表示1980年5月），如果包含日期信息则格式为YYYY年MM月DD日（如：1985年05月17日）。注意：年份必须是4位数字，月份和日期必须是2位数字（如1980年5月应格式化为1980年05月，不是1980年5月）",
      "性别统一为\"男\"或\"女\"，不要使用\"男性\"或\"女性\"。如果文本中提到\"男性\"、\"男\"、\"先生\"等，统一转换为\"男\"；如果提到\"女性\"、\"女\"、\"女士\"等，统一转换为\"女\"",
      "年龄字段：如果文本中直接提到年龄（如\"30岁\"、\"30周岁\"），直接提取数字；如果只有出生年月，可以根据当前年份计算年龄（当前年份为2024年）",
      "单位及职务字段：如果文本中提到\"XX公司总经理\"、\"XX单位XX职务\"等，需要同时提取单位名称和职务名称",
@ -25,7 +25,10 @@
      "身份证号码只提取数字，不包含其他字符",
      "联系方式提取电话号码，格式化为纯数字",
      "地址信息保持完整，包含省市区街道等详细信息",
-      "只返回JSON对象，不要包含markdown代码块标记、思考过程或其他说明文字"
+      "只返回JSON对象，不要包含markdown代码块标记、思考过程或其他说明文字",
+      "JSON格式要求：所有字段名必须使用双引号，字段名中不能包含前导点（如不能使用\".target_gender\"，应使用\"target_gender\"），字段名前后不能有空格",
+      "必须返回所有要求的字段，即使值为空字符串也要包含在JSON中",
+      "字段名必须严格按照JSON示例中的字段编码，不能随意修改或拼写错误（如不能使用\"targetsProfessionalRank\"，应使用\"target_professional_rank\"）"
    ]
  },
  "field_formatting": {
@ -51,16 +54,22 @@
    "target_date_of_birth": {
      "description": "被核查人员出生年月",
      "rules": [
-        "格式：YYYYMM，如198005表示1980年5月",
-        "如果只有年份，月份设为01",
-        "如果文本中提到\"X年X月X日出生\"，只提取年月，忽略日期"
+        "格式：YYYY年MM月（中文格式），如1980年05月表示1980年5月（注意：月份必须是2位数字，如5月应写为05月，不是5月）",
+        "如果只有年份，月份设为01（如1980年应格式化为1980年01月）",
+        "如果文本中提到\"X年X月X日出生\"，只提取年月，忽略日期",
+        "如果文本中提到\"1980年5月\"，格式化为\"1980年05月\"（月份补零）",
+        "如果文本中提到\"1980年05月\"，保持为\"1980年05月\"",
+        "年份必须是4位数字，月份必须是2位数字（01-12）",
+        "输出格式示例：1980年05月、1985年03月、1990年12月"
      ]
    },
    "target_date_of_birth_full": {
      "description": "被核查人员出生年月日",
      "rules": [
-        "格式：YYYYMMDD，如19800515表示1980年5月15日",
-        "如果只有年月，日期设为01"
+        "格式：YYYY年MM月DD日（中文格式），如1985年05月17日表示1985年5月17日",
+        "如果只有年月，日期设为01（如1980年05月应格式化为1980年05月01日）",
+        "年份必须是4位数字，月份和日期必须是2位数字（01-12和01-31）",
+        "输出格式示例：1985年05月17日、1980年03月15日、1990年12月01日"
      ]
    },
    "target_political_status": {
--- a/services/ai_service.py
+++ b/services/ai_service.py
@ -295,6 +295,8 @@ class AIService:
                # 规范化字段名并映射到正确的字段编码
                normalized_data = self._normalize_field_names(extracted_data, output_fields)
                print(f"[AI服务] 规范化后的字段名: {list(normalized_data.keys())}")
+                # 规范化日期格式
+                normalized_data = self._normalize_date_formats(normalized_data, output_fields)
                return normalized_data
            
            # 如果无法提取JSON，记录错误
@ -435,13 +437,22 @@ class AIService:
        json_str = re.sub(r',\s*]', ']', json_str)
        
        # 修复字段名中的错误（如 .target_gender -> target_gender）
-        json_str = re.sub(r'["\']\.([^"\']+)["\']\s*:', r'"\1":', json_str)
+        # 处理前导点和尾随空格
+        json_str = re.sub(r'["\']\s*\.([^"\']+?)\s*["\']\s*:', r'"\1":', json_str)
+        json_str = re.sub(r'["\']\.([^"\']+?)["\']\s*:', r'"\1":', json_str)
        
        # 修复字段名中的空格（如 "target name" -> "target_name"）
-        json_str = re.sub(r'["\']([^"\']+)\s+([^"\']+)["\']\s*:', r'"\1_\2":', json_str)
+        json_str = re.sub(r'["\']([^"\']+?)\s+([^"\']+?)["\']\s*:', r'"\1_\2":', json_str)
        
-        # 尝试修复未加引号的字段名
-        json_str = re.sub(r'(\w+)\s*:', r'"\1":', json_str)
+        # 修复字段名中的尾随空格（如 "target_gender " -> "target_gender"）
+        json_str = re.sub(r'["\']([^"\']+?)\s+["\']\s*:', r'"\1":', json_str)
+        
+        # 修复字段名中的前导空格（如 " target_gender" -> "target_gender"）
+        json_str = re.sub(r'["\']\s+([^"\']+?)["\']\s*:', r'"\1":', json_str)
+        
+        # 尝试修复未加引号的字段名（但要避免破坏字符串值）
+        # 只修复在冒号前的未加引号的标识符
+        json_str = re.sub(r'([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)
        
        return json_str
    
@ -519,6 +530,37 @@ class AIService:
            if code in field_code_map:
                name_to_code_map[schema_key] = code
        
+        # 添加常见拼写错误的映射（如 targetsProfessionalRank -> target_professional_rank）
+        typo_mapping = {
+            'targetsProfessionalRank': 'target_professional_rank',
+            'targetProfessionalRank': 'target_professional_rank',
+            'targets_professional_rank': 'target_professional_rank',
+            'targetsProfessional': 'target_professional_rank',
+            'professionalRank': 'target_professional_rank',
+            'targetGender': 'target_gender',
+            'targetsGender': 'target_gender',
+            'targetDateOfBirth': 'target_date_of_birth',
+            'targetsDateOfBirth': 'target_date_of_birth',
+            'targetPoliticalStatus': 'target_political_status',
+            'targetsPoliticalStatus': 'target_political_status',
+            'targetOrganizationAndPosition': 'target_organization_and_position',
+            'targetsOrganizationAndPosition': 'target_organization_and_position',
+            'targetOrganization': 'target_organization',
+            'targetsOrganization': 'target_organization',
+            'targetPosition': 'target_position',
+            'targetsPosition': 'target_position',
+            'targetEducationLevel': 'target_education_level',
+            'targetsEducationLevel': 'target_education_level',
+            'targetAge': 'target_age',
+            'targetsAge': 'target_age',
+            'targetIssueDescription': 'target_issue_description',
+            'targetsIssueDescription': 'target_issue_description',
+        }
+        # 添加拼写错误映射（仅当字段编码存在时）
+        for typo_key, code in typo_mapping.items():
+            if code in field_code_map:
+                name_to_code_map[typo_key] = code
+        
        # 规范化数据
        normalized_data = {}
        
@ -539,7 +581,7 @@ class AIService:
                    value = str(value) if value else ''
            
            # 清理字段名：去掉前导点、空格等
-            cleaned_key = key.strip().lstrip('.')
+            cleaned_key = key.strip().lstrip('.').rstrip()
            
            # 尝试直接匹配
            if cleaned_key in name_to_code_map:
@ -548,17 +590,42 @@ class AIService:
                continue
            
            # 尝试不区分大小写匹配
+            matched = False
            for name, code in name_to_code_map.items():
                if cleaned_key.lower() == name.lower():
                    normalized_data[code] = value
+                    matched = True
                    break
-            else:
+            
+            if not matched:
+                # 尝试模糊匹配：处理拼写错误（如 targetsProfessionalRank -> target_professional_rank）
+                # 移除常见的前缀/后缀错误（如 targets -> target）
+                normalized_key = cleaned_key
+                if normalized_key.startswith('targets'):
+                    normalized_key = 'target' + normalized_key[7:]  # targets -> target
+                elif normalized_key.startswith('targets_'):
+                    normalized_key = 'target_' + normalized_key[8:]  # targets_ -> target_
+                
+                # 尝试匹配规范化后的key
+                if normalized_key in name_to_code_map:
+                    correct_code = name_to_code_map[normalized_key]
+                    normalized_data[correct_code] = value
+                    matched = True
+                    print(f"[AI服务] 拼写修正: '{cleaned_key}' -> '{normalized_key}' -> '{correct_code}'")
+                elif normalized_key.lower() in [k.lower() for k in name_to_code_map.keys()]:
+                    for name, code in name_to_code_map.items():
+                        if normalized_key.lower() == name.lower():
+                            normalized_data[code] = value
+                            matched = True
+                            print(f"[AI服务] 拼写修正（不区分大小写）: '{cleaned_key}' -> '{normalized_key}' -> '{code}'")
+                            break
+            
+            if not matched:
                # 如果找不到匹配，尝试模糊匹配
                # 检查是否包含字段编码的关键部分
-                matched = False
                for field_code in field_code_map.keys():
                    # 如果清理后的key包含字段编码的关键部分，或者字段编码包含key的关键部分
-                    key_parts = cleaned_key.lower().replace('_', '').replace('-', '')
+                    key_parts = cleaned_key.lower().replace('_', '').replace('-', '').replace('targets', 'target')
                    code_parts = field_code.lower().replace('_', '').replace('-', '')
                    
                    # 检查相似度（简单匹配）
@ -580,6 +647,170 @@ class AIService:
        
        return normalized_data
    
+    def _normalize_date_formats(self, data: Dict, output_fields: List[Dict]) -> Dict:
+        """
+        Rewrite the birth-date fields of ``data`` in the Chinese date format.
+
+        ``target_date_of_birth`` becomes ``YYYY年MM月`` and
+        ``target_date_of_birth_full`` becomes ``YYYY年MM月DD日``; other keys are
+        untouched. ``data`` is modified in place and returned; a rewritten value
+        is always stored as text.
+
+        Args:
+            data: Extracted values keyed by field code.
+            output_fields: Output field definitions; not consulted here, kept
+                for signature compatibility.
+
+        Returns:
+            The same ``data`` dict with both date fields normalized.
+        """
+        # Field code -> name of the method that canonicalizes it. Methods are
+        # looked up only when the field is present.
+        normalizers = (
+            ('target_date_of_birth', '_normalize_date_to_chinese_yyyymm'),
+            ('target_date_of_birth_full', '_normalize_date_to_chinese_yyyymmdd'),
+        )
+        for field_code, method_name in normalizers:
+            raw_value = data.get(field_code)
+            date_value = str(raw_value).strip() if raw_value else ''
+            if not date_value:
+                continue
+            normalized_date = getattr(self, method_name)(date_value)
+            # Compare with the stored value rather than the stripped copy, so
+            # surrounding whitespace is removed even for an already-canonical date.
+            if normalized_date and normalized_date != raw_value:
+                print(f"[AI服务] 日期格式规范化: '{raw_value}' -> '{normalized_date}'")
+                data[field_code] = normalized_date
+
+        return data
+    
+    def _normalize_date_to_chinese_yyyymm(self, date_str: str) -> Optional[str]:
+        """
+        Convert a year-month string to the ``YYYY年MM月`` format.
+
+        Recognized inputs: ``1980年5月`` (a following day is ignored), the digit
+        runs ``198005``, ``19800515`` and ``19805``, the separated forms
+        ``1980-5``, ``1980/5`` and ``1980.5``, and a bare year ``1980`` (month
+        becomes 01). The result always uses ASCII digits.
+
+        Args:
+            date_str: Date text in one of the formats above.
+
+        Returns:
+            The normalized text; the stripped input when nothing matches; None
+            when ``date_str`` is empty.
+        """
+        if not date_str:
+            return None
+
+        date_str = date_str.strip()
+
+        # (year, month) candidates in priority order.
+        candidates = []
+
+        # Chinese format. re.search also accepts text with a day after it.
+        match = re.search(r'(\d{4})年(\d{1,2})月', date_str)
+        if match:
+            candidates.append((match.group(1), match.group(2)))
+
+        # Pure digit runs (mutually exclusive by length).
+        if re.fullmatch(r'\d{6}', date_str):
+            # YYYYMM; a month of "00" is treated as January.
+            candidates.append((date_str[:4], int(date_str[4:]) or 1))
+        elif re.fullmatch(r'\d{8}', date_str) and int(date_str[6:]) <= 31:
+            # YYYYMMDD with a plausible day: keep year and month, drop the day.
+            candidates.append((date_str[:4], int(date_str[4:6]) or 1))
+        elif re.fullmatch(r'\d{5}', date_str):
+            # YYYYM, e.g. "19805".
+            candidates.append((date_str[:4], date_str[4:]))
+
+        # "1980-5", "1980/05", "1980.5"; a trailing day part is ignored.
+        for separator in ('-', '/', '.'):
+            match = re.search(r'(\d{4})' + re.escape(separator) + r'(\d{1,2})', date_str)
+            if match:
+                candidates.append((match.group(1), match.group(2)))
+
+        # The first candidate with a month in 1..12 wins.
+        for year, month in candidates:
+            # int() accepts every digit that \d matches (e.g. full-width
+            # "１９８０"), so formatting through int yields ASCII digits.
+            month_number = int(month)
+            if 1 <= month_number <= 12:
+                return f"{int(year):04d}年{month_number:02d}月"
+
+        # Year only: default the month to 01.
+        if re.fullmatch(r'\d{4}', date_str):
+            return f"{int(date_str):04d}年01月"
+
+        # Unparseable: hand the (stripped) text back unchanged.
+        return date_str
+    
+    def _normalize_date_to_chinese_yyyymmdd(self, date_str: str) -> Optional[str]:
+        """
+        Convert a full date string to the ``YYYY年MM月DD日`` format.
+
+        Recognized inputs: ``1985年5月17日``, ``19850517``, ``1985-5-17``,
+        ``1985/5/17`` and ``1985.5.17``. Text holding only a year and month is
+        completed with day 01. The result always uses ASCII digits.
+
+        A complete date whose month or day is out of range is returned
+        unchanged; it is not downgraded to the first day of the month.
+
+        Args:
+            date_str: Date text in one of the formats above.
+
+        Returns:
+            The normalized text; the stripped input when it cannot be parsed;
+            None when ``date_str`` is empty.
+        """
+        if not date_str:
+            return None
+
+        date_str = date_str.strip()
+
+        # (year, month, day) candidates as ints, in priority order.
+        candidates = []
+
+        # Chinese format, e.g. "1985年5月17日".
+        match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', date_str)
+        if match:
+            candidates.append(tuple(int(part) for part in match.groups()))
+
+        # YYYYMMDD; a month or day of "00" is treated as 01.
+        if re.fullmatch(r'\d{8}', date_str):
+            candidates.append((
+                int(date_str[:4]),
+                int(date_str[4:6]) or 1,
+                int(date_str[6:]) or 1,
+            ))
+
+        # "1985-5-17", "1985/05/17", "1985.5.17".
+        for separator in ('-', '/', '.'):
+            sep = re.escape(separator)
+            match = re.search(r'(\d{4})' + sep + r'(\d{1,2})' + sep + r'(\d{1,2})', date_str)
+            if match:
+                candidates.append(tuple(int(part) for part in match.groups()))
+
+        for year, month, day in candidates:
+            if 1 <= month <= 12 and 1 <= day <= 31:
+                # Formatting ints yields zero-padded ASCII digits.
+                return f"{year:04d}年{month:02d}月{day:02d}日"
+
+        if candidates:
+            # The text carried a day but failed the range check. Falling
+            # through would replace that day with 01, so return it as is.
+            return date_str
+
+        # Year-month only: normalize that part and append day 01. The result
+        # is re-validated because the helper returns unparseable text as is.
+        year_month = self._normalize_date_to_chinese_yyyymm(date_str)
+        match = re.fullmatch(r'(\d{4})年(\d{2})月', year_month or '')
+        if match and 1 <= int(match.group(2)) <= 12:
+            return f"{int(match.group(1)):04d}年{int(match.group(2)):02d}月01日"
+
+        # Unparseable: hand the (stripped) text back unchanged.
+        return date_str
+    
    def _parse_text_response(self, text: str, output_fields: List[Dict]) -> Dict:
        """
        从文本响应中解析字段值（备用方案）