From 684cb0141a13b6fe56ea748b2fdb906448fad819 Mon Sep 17 00:00:00 2001
From: python <liji517@qq.com>
Date: Tue, 9 Dec 2025 11:30:02 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=BC=BAAI=E6=9C=8D=E5=8A=A1=E7=9A=84?=
 =?UTF-8?q?JSON=E6=8F=90=E5=8F=96=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E4=BA=86=E4=BB=8E=E6=96=87=E6=9C=AC=E4=B8=AD=E6=8F=90?=
 =?UTF-8?q?=E5=8F=96JSON=E5=AF=B9=E8=B1=A1=E7=9A=84=E6=96=B9=E6=B3=95?=
 =?UTF-8?q?=EF=BC=8C=E6=94=B9=E8=BF=9B=E4=BA=86=E5=AF=B9=E5=8D=8E=E4=B8=BA?=
 =?UTF-8?q?=E5=A4=A7=E6=A8=A1=E5=9E=8B=E8=BF=94=E5=9B=9E=E5=86=85=E5=AE=B9?=
 =?UTF-8?q?=E7=9A=84=E5=A4=84=E7=90=86=EF=BC=8C=E7=A1=AE=E4=BF=9D=E5=8F=AA?=
 =?UTF-8?q?=E8=BF=94=E5=9B=9EJSON=E5=AF=B9=E8=B1=A1=E8=80=8C=E4=B8=8D?=
 =?UTF-8?q?=E5=8C=85=E5=90=AB=E5=85=B6=E4=BB=96=E8=AF=B4=E6=98=8E=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 services/ai_service.py | 105 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 89 insertions(+), 16 deletions(-)
diff --git a/services/ai_service.py b/services/ai_service.py
index 2581a79..4fa434e 100644
--- a/services/ai_service.py
+++ b/services/ai_service.py
@@ -3,6 +3,7 @@ AI服务 - 封装大模型调用
 仅支持华为大模型
 """
 import os
+import re
 import requests
 import json
 from typing import Dict, List, Optional
@@ -122,7 +123,7 @@ class AIService:
                 "messages": [
                     {
                         "role": "system",
-                        "content": "你是一个专业的数据提取助手，能够从文本中准确提取结构化信息。请严格按照JSON格式返回结果。"
+                        "content": "你是一个专业的数据提取助手，能够从文本中准确提取结构化信息。请严格按照JSON格式返回结果，只返回JSON对象，不要包含任何其他文字说明、思考过程或markdown代码块标记。"
                     },
                     {
                         "role": "user",
@@ -162,23 +163,19 @@ class AIService:
             if 'choices' in result and len(result['choices']) > 0:
                 content = result['choices'][0]['message']['content']
                 
+                # 处理思考过程标签（华为大模型可能返回思考过程）
+                # 移除思考过程标签之前的内容，只保留实际回答
+                # 根据用户提供的示例，华为大模型使用 </think> 标签
+                if '</think>' in content:
+                    content = content.split('</think>')[-1].strip()
+                
                 # 尝试解析JSON
-                try:
-                    # 如果返回的是代码块，提取JSON部分
-                    if '```json' in content:
-                        json_start = content.find('```json') + 7
-                        json_end = content.find('```', json_start)
-                        content = content[json_start:json_end].strip()
-                    elif '```' in content:
-                        json_start = content.find('```') + 3
-                        json_end = content.find('```', json_start)
-                        content = content[json_start:json_end].strip()
-                    
-                    extracted_data = json.loads(content)
+                extracted_data = self._extract_json_from_text(content)
+                if extracted_data:
                     return extracted_data
-                except json.JSONDecodeError:
-                    # 如果不是JSON，尝试从文本中提取
-                    return self._parse_text_response(content, output_fields)
+                
+                # 如果无法提取JSON，尝试从文本中提取
+                return self._parse_text_response(content, output_fields)
             else:
                 raise Exception("API返回格式异常")
                 
@@ -187,6 +184,82 @@ class AIService:
         except Exception as e:
             raise Exception(f"AI服务调用失败: {str(e)}")
     
+    def _extract_json_from_text(self, text: str) -> Optional[Dict]:
+        """
+        Extract a JSON value (normally an object) from model output.
+
+        Candidates are tried in order and the first that parses wins:
+        1. The body of the first ```json fenced code block.
+        2. The body of the first ``` fenced code block.
+        3. The whole text, stripped of surrounding whitespace.
+        4. Each "{" in the text, scanned left to right and decoded with
+           ``json.JSONDecoder.raw_decode``.
+
+        Steps 1-3 return whatever JSON value they parse. Step 4 returns
+        only a non-empty ``dict``; an embedded ``{}`` is skipped.
+
+        A dedicated decoder is used for step 4 because the response text
+        usually continues after the object, which ``json.loads`` rejects.
+
+        Args:
+            text: Raw model output; the JSON may be wrapped in prose or
+                Markdown code fences.
+
+        Returns:
+            The parsed value, or ``None`` when ``text`` is empty or no
+            candidate parses.
+        """
+        if not text:
+            return None
+
+        # Fenced code blocks: prefer an explicit ```json fence, then fall
+        # back to the first fence of any kind.
+        for marker in ('```json', '```'):
+            body_start = text.find(marker)
+            if body_start == -1:
+                continue
+            body_start += len(marker)
+            body_end = text.find('```', body_start)
+            if body_end == -1:
+                # Unterminated fence; leave it to the later steps.
+                continue
+            try:
+                return json.loads(text[body_start:body_end].strip())
+            except json.JSONDecodeError:
+                # Fence body is not JSON; try the next candidate.
+                pass
+
+        # The whole response may already be bare JSON.
+        try:
+            return json.loads(text.strip())
+        except json.JSONDecodeError:
+            # Not bare JSON; fall through to the embedded-object scan.
+            pass
+
+        # Look for an object embedded in prose. raw_decode applies the
+        # real JSON grammar, so arbitrarily deep nesting and braces inside
+        # string values are handled and the outermost object comes back
+        # whole. A pattern limited to one nesting level would instead
+        # return an inner fragment, and plain brace counting is fooled by
+        # "{" or "}" inside strings.
+        decoder = json.JSONDecoder()
+        start = text.find('{')
+        while start != -1:
+            try:
+                candidate, consumed = decoder.raw_decode(text[start:])
+            except json.JSONDecodeError as err:
+                # Resume at the failure point, not one character on, so a
+                # fragment nested inside a malformed object is not
+                # returned as if it were the whole answer.
+                candidate, consumed = None, max(err.pos, 1)
+            # A slice starting at "{" can only decode to a dict; an empty
+            # one carries no fields, so keep scanning.
+            if candidate:
+                return candidate
+            start = text.find('{', start + consumed)
+
+        return None
+    
     def _parse_text_response(self, text: str, output_fields: List[Dict]) -> Dict:
         """
         从文本响应中解析字段值（备用方案）