"""
数据资产智能识别服务
"""
|
||
import json
|
||
import time
|
||
from typing import List, Dict, Any, Optional
|
||
from app.schemas.inventory import (
|
||
TableInput,
|
||
FieldInput,
|
||
TableOutput,
|
||
FieldOutput,
|
||
Statistics,
|
||
TokenUsage,
|
||
AnalyzeOptions,
|
||
)
|
||
from app.utils.llm_client import llm_client
|
||
from app.utils.logger import logger
|
||
from app.core.config import settings
|
||
from app.core.exceptions import LLMAPIException, ValidationException
|
||
|
||
|
||
# ==================== 提示词模板 ====================
|
||
|
||
# System prompt (Chinese) passed as `system_prompt` to `llm_client.call` by
# `AIAnalyzeService.analyze`. It casts the model as a data-asset management
# expert and lists the output requirements (accuracy, PIPL / Data Security Law
# compliance, professional wording, strict JSON output).
# The text is runtime data: edit it only when a prompt change is intended.
SYSTEM_PROMPT = """你是一位专业的数据资产管理专家,擅长识别数据资产的中文名称、业务含义、敏感信息和重要数据特征。
|
||
|
||
## 你的专业能力
|
||
- 深入理解数据资产管理、数据合规(PIPL、数据安全法)等法规要求
|
||
- 熟悉各种业务场景下的数据资产命名规范
|
||
- 能够准确识别敏感个人信息(SPI)和重要数据
|
||
- 具备优秀的文本理解和生成能力
|
||
|
||
## 输出要求
|
||
1. **准确性**: 中文命名必须准确反映业务含义
|
||
2. **合规性**: PII 识别必须符合《个人信息保护法》(PIPL)
|
||
3. **完整性**: 重要数据识别必须符合《数据安全法》
|
||
4. **专业性**: 使用专业术语,符合行业标准
|
||
5. **结构化**: 严格按照JSON格式输出
|
||
"""
|
||
|
||
# User prompt template (Chinese), rendered with `str.format` in `build_prompt`.
# Placeholders: {industry_info}, {context_info}, {tables_info}, {json_schema}.
# Because the template goes through `str.format`, any literal brace added to
# this text must be doubled ("{{" / "}}"); the substituted values themselves
# are not re-parsed, so braces inside them are safe.
USER_PROMPT_TEMPLATE = """请基于以下信息识别数据资产:
|
||
|
||
## 行业背景
|
||
{industry_info}
|
||
|
||
## 业务背景
|
||
{context_info}
|
||
|
||
## 表结构信息
|
||
{tables_info}
|
||
|
||
## 识别要求
|
||
1. 为每个表生成中文名称(ai_name)和业务描述(desc)
|
||
2. 为每个字段生成中文名称(ai_name)和业务描述(desc)
|
||
3. 识别敏感个人信息(PII):
|
||
- 手机号、身份证号、姓名、邮箱、地址等
|
||
- 生物识别信息(人脸、指纹等)
|
||
- 医疗健康信息
|
||
- 金融账户信息
|
||
- 行踪轨迹信息
|
||
4. 识别重要数据(符合《数据安全法》):
|
||
- 涉及国家安全的数据
|
||
- 涉及公共利益的数据
|
||
- 高精度地理信息(军事禁区周边)
|
||
- 关键物资流向(稀土、芯片等)
|
||
5. 计算置信度评分(0-100):
|
||
- 字段命名规范度
|
||
- 注释完整性
|
||
- 业务含义明确度
|
||
|
||
## 输出格式(JSON)
|
||
{json_schema}
|
||
|
||
请严格按照以上JSON Schema格式输出,确保所有字段都存在。
|
||
"""
|
||
|
||
# JSON Schema text describing the answer shape requested from the model:
# a root object with a required "tables" array; each table carries
# raw_name / ai_name / desc / confidence / fields (required) plus optional
# pii / important / important_data_types; each field carries raw_name /
# ai_name / desc / pii / pii_type / is_important_data / confidence.
# In the code visible here it is only substituted verbatim into the
# {json_schema} placeholder by `build_prompt`; it is not used to validate the
# response.
JSON_SCHEMA = """
|
||
{
|
||
"type": "object",
|
||
"required": ["tables"],
|
||
"properties": {
|
||
"tables": {
|
||
"type": "array",
|
||
"items": {
|
||
"type": "object",
|
||
"required": ["raw_name", "ai_name", "desc", "confidence", "fields"],
|
||
"properties": {
|
||
"raw_name": {"type": "string"},
|
||
"ai_name": {"type": "string"},
|
||
"desc": {"type": "string"},
|
||
"confidence": {"type": "integer", "minimum": 0, "maximum": 100},
|
||
"fields": {
|
||
"type": "array",
|
||
"items": {
|
||
"type": "object",
|
||
"required": ["raw_name", "ai_name", "desc", "pii", "pii_type", "is_important_data", "confidence"],
|
||
"properties": {
|
||
"raw_name": {"type": "string"},
|
||
"ai_name": {"type": "string"},
|
||
"desc": {"type": "string"},
|
||
"pii": {"type": "array", "items": {"type": "string"}},
|
||
"pii_type": {"type": ["string", "null"]},
|
||
"is_important_data": {"type": "boolean"},
|
||
"confidence": {"type": "integer", "minimum": 0, "maximum": 100}
|
||
}
|
||
}
|
||
},
|
||
"pii": {"type": "array", "items": {"type": "string"}},
|
||
"important": {"type": "boolean"},
|
||
"important_data_types": {"type": "array", "items": {"type": "string"}}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
"""
|
||
|
||
|
||
# ==================== PII 识别规则引擎 ====================
|
||
|
||
# Rule table for the keyword-based PII check.
# Each entry maps a rule key to:
#   "keywords": substrings looked for in field names / comments,
#   "type":     value written to FieldOutput.pii_type on a match,
#   "label":    value written to FieldOutput.pii on a match.
# Insertion order matters: validate_pii_detection stops at the first rule
# that matches.
PII_KEYWORDS = {
    rule_key: {"keywords": list(rule_keywords), "type": rule_type, "label": rule_label}
    for rule_key, rule_keywords, rule_type, rule_label in (
        (
            "phone",
            ("phone", "mobile", "tel", "telephone", "手机", "电话", "联系方式"),
            "contact",
            "手机号",
        ),
        (
            "id_card",
            ("id_card", "idcard", "identity", "身份证", "证件号", "身份证明"),
            "identity",
            "身份证号",
        ),
        (
            "name",
            ("name", "real_name", "姓名", "名字", "用户名"),
            "name",
            "姓名",
        ),
        (
            "email",
            ("email", "mail", "邮箱", "电子邮箱", "邮件"),
            "email",
            "邮箱",
        ),
        (
            "address",
            ("address", "addr", "地址", "住址", "居住地址"),
            "address",
            "地址",
        ),
        (
            "bank_card",
            ("bank_card", "card_no", "银行卡", "卡号", "账户"),
            "financial",
            "银行卡号",
        ),
    )
}
|
||
|
||
|
||
def validate_pii_detection(
    field: FieldOutput,
    field_input: FieldInput,
    rules: Optional[Dict[str, Dict[str, Any]]] = None,
) -> FieldOutput:
    """Fill in missing PII information on ``field`` using keyword rules.

    The rules only run when the AI result lacks a PII label (``field.pii``)
    or a PII category (``field.pii_type``). The first rule whose keyword
    occurs (case-insensitive substring match) in the field name or in the
    input comment wins; it fills only the attributes that are still empty.

    Args:
        field: AI-produced field result; updated in place.
        field_input: Original field definition (name and optional comment).
        rules: Optional rule table with the same layout as ``PII_KEYWORDS``
            (``{"keywords": [...], "type": ..., "label": ...}`` per entry).
            Defaults to the module-level ``PII_KEYWORDS``.

    Returns:
        The same ``field`` object, possibly with ``pii`` / ``pii_type`` set.
    """
    # Nothing to supplement when the model already gave both label and type.
    if field.pii and field.pii_type:
        return field

    if rules is None:
        rules = PII_KEYWORDS

    # Lower-case every searchable text once. The original input name is
    # searched as well as the name echoed by the model, so a missing or
    # altered echo cannot hide a match; empty/None texts are skipped.
    haystacks = [
        text.lower()
        for text in (field.raw_name, field_input.raw_name, field_input.comment)
        if text
    ]

    for rule in rules.values():
        needles = [keyword.lower() for keyword in rule["keywords"]]
        if any(needle in text for needle in needles for text in haystacks):
            if not field.pii:
                field.pii = [rule["label"]]
            if not field.pii_type:
                field.pii_type = rule["type"]
            # First matching rule wins; later rules are not consulted.
            break

    return field
|
||
|
||
|
||
# ==================== 置信度评分算法 ====================
|
||
|
||
def calculate_confidence(field_input: FieldInput, field_output: FieldOutput) -> int:
    """Compute a heuristic confidence score for one recognised field.

    Scoring (capped at 100):
        * 50  base score
        * +15 lower-case name containing an underscore (snake_case), or
          +10 lower-case alphanumeric name, or
          +5  any other purely alphanumeric name
        * +20 the input field has a non-blank comment
        * +15 the AI name is non-blank and differs from the raw field name
        * +15 the AI description is non-blank

    Args:
        field_input: Original field definition (``raw_name``, ``comment``).
        field_output: AI result for the field (``ai_name``, ``desc``).

    Returns:
        Integer score between 50 and 100.
    """
    score = 50  # base score

    # Naming convention (at most 15 points).
    field_name = field_input.raw_name or ""
    if field_name.islower() and "_" in field_name:
        score += 15  # snake_case
    elif field_name.islower() and field_name.isalnum():
        score += 10  # lower-case letters/digits
    elif field_name.isalnum():
        score += 5  # other alphanumeric names

    # Comment completeness (20 points).
    if (field_input.comment or "").strip():
        score += 20

    # Quality of the AI result (at most 30 points). Values are stripped first
    # so that a whitespace-only name gets no credit, in line with how the
    # comment and the description are treated.
    ai_name = (field_output.ai_name or "").strip()
    if ai_name and ai_name != field_name.strip():
        score += 15  # the AI produced a name of its own
    if (field_output.desc or "").strip():
        score += 15  # the AI produced a description

    return min(score, 100)
|
||
|
||
|
||
# ==================== 提示词构建 ====================
|
||
|
||
def build_prompt(
    tables: List[TableInput],
    industry: Optional[str] = None,
    context: Optional[str] = None
) -> str:
    """Render the user prompt sent to the LLM.

    Every table becomes a text section listing its name and, line by line,
    each field's name, type and (when present) comment. The sections,
    together with the industry and business-context texts (or their
    placeholder defaults), are substituted into ``USER_PROMPT_TEMPLATE``
    along with ``JSON_SCHEMA``.

    Args:
        tables: Tables to describe in the prompt.
        industry: Industry description; a default text is used when empty.
        context: Business background; a default text is used when empty.

    Returns:
        The fully formatted user prompt.
    """

    def _describe_field(column) -> str:
        # One bullet per field; the comment part is appended only when set.
        line = f" - {column.raw_name} ({column.type})"
        if column.comment:
            return f"{line} - 注释: {column.comment}"
        return line

    def _describe_table(tbl) -> str:
        # Header followed by one newline-terminated bullet per field.
        header = f"表名: {tbl.raw_name}\n字段列表:\n"
        return header + "".join(_describe_field(col) + "\n" for col in tbl.fields)

    return USER_PROMPT_TEMPLATE.format(
        industry_info=industry or "未指定",
        context_info=context or "未提供业务背景信息",
        tables_info="\n\n".join(_describe_table(tbl) for tbl in tables),
        json_schema=JSON_SCHEMA,
    )
|
||
|
||
|
||
# ==================== 主要服务类 ====================
|
||
|
||
class AIAnalyzeService:
    """LLM-backed data-asset recognition service.

    ``analyze`` builds the prompt, calls the LLM client, aligns the model's
    answer with the submitted tables and fields, and post-processes every
    field with the keyword-based PII check and the local confidence score.
    """

    @staticmethod
    def _pair_by_raw_name(results: Any, inputs: List[Any]) -> List[Any]:
        """Pair each input item with the LLM result entry that describes it.

        Entries are matched by ``raw_name`` first, so a reordered or partial
        answer is not mis-assigned. When no entry carries the input's name,
        the entry at the same position is used, provided it is not named
        after another input (this keeps purely positional answers working).
        Inputs without any usable entry are left out.

        Args:
            results: Value of the ``tables`` / ``fields`` key of the LLM
                answer; anything that is not a list is treated as empty.
            inputs: Objects exposing a ``raw_name`` attribute.

        Returns:
            List of ``(result_dict, input_item)`` tuples in input order.
        """
        entries = results if isinstance(results, list) else []

        # Queue entries per name so duplicated names are consumed in order.
        unused_by_name = {}
        for entry in entries:
            if isinstance(entry, dict):
                entry_name = entry.get("raw_name")
                if isinstance(entry_name, str):
                    unused_by_name.setdefault(entry_name, []).append(entry)

        known_names = {item.raw_name for item in inputs}
        pairs = []
        for position, item in enumerate(inputs):
            queue = unused_by_name.get(item.raw_name)
            match = queue.pop(0) if queue else None

            if match is None and position < len(entries):
                candidate = entries[position]
                if isinstance(candidate, dict):
                    candidate_name = candidate.get("raw_name")
                    # Never steal an entry that names a different input.
                    if not isinstance(candidate_name, str) or candidate_name not in known_names:
                        match = candidate

            if match is not None:
                pairs.append((match, item))
        return pairs

    @staticmethod
    def _clamp_confidence(value: Any, default: int = 80) -> int:
        """Coerce an LLM-supplied confidence to an int in the range 0-100.

        Returns ``default`` when the value is missing or not numeric.
        """
        try:
            score = int(value)
        except (TypeError, ValueError, OverflowError):
            return default
        return max(0, min(100, score))

    @staticmethod
    async def analyze(
        tables: List[TableInput],
        project_id: str,
        industry: Optional[str] = None,
        context: Optional[str] = None,
        options: Optional[AnalyzeOptions] = None
    ) -> Dict[str, Any]:
        """Run the AI analysis for a set of tables.

        Args:
            tables: Tables (with their fields) to analyse.
            project_id: Project identifier; used for logging only.
            industry: Optional industry description for the prompt.
            context: Optional business background for the prompt.
            options: Analysis options; defaults to ``AnalyzeOptions()``.

        Returns:
            Dict with the keys ``tables``, ``statistics``,
            ``processing_time``, ``model_used`` and ``token_usage``.

        Raises:
            LLMAPIException: On any failure while building the prompt,
                calling the model or converting its answer. The original
                exception is chained as ``__cause__``.
        """
        start_time = time.time()

        # Resolve options.
        analyze_options = options or AnalyzeOptions()
        model = analyze_options.model or settings.DEFAULT_LLM_MODEL
        # Compare against None so that an explicit temperature of 0 is kept.
        temperature = analyze_options.temperature
        if temperature is None:
            temperature = settings.DEFAULT_TEMPERATURE
        enable_pii = analyze_options.enable_pii_detection
        # NOTE(review): analyze_options.enable_important_data_detection is not
        # consulted anywhere in this method - confirm whether it should gate
        # the is_important_data results.

        logger.info(f"开始 AI 分析 - 项目ID: {project_id}, 表数量: {len(tables)}, 模型: {model}")

        try:
            # Build the prompt.
            prompt = build_prompt(tables, industry, context)
            logger.debug(f"提示词长度: {len(prompt)} 字符")

            # Call the model.
            response_text = await llm_client.call(
                prompt=prompt,
                system_prompt=SYSTEM_PROMPT,
                temperature=temperature,
                model=model
            )

            # Parse the answer.
            llm_result = llm_client.parse_json_response(response_text)
            logger.info("大模型返回结果解析成功")

            tables_output = []
            total_pii_fields = 0
            total_important_fields = 0
            total_confidence = 0
            total_fields = 0

            # `or []` also covers an explicit JSON null.
            llm_tables = llm_result.get("tables") or []
            if len(llm_tables) != len(tables):
                logger.warning(
                    f"返回的表数量不匹配: 期望 {len(tables)}, 实际 {len(llm_tables)}"
                )

            table_pairs = AIAnalyzeService._pair_by_raw_name(llm_tables, tables)
            if len(table_pairs) != len(tables):
                logger.warning(
                    f"部分表未能与大模型结果匹配: 期望 {len(tables)}, 匹配 {len(table_pairs)}"
                )

            for table_result, table_input in table_pairs:
                fields_output = []
                table_pii = []
                table_important = False
                table_important_types = []

                field_pairs = AIAnalyzeService._pair_by_raw_name(
                    table_result.get("fields") or [], table_input.fields
                )
                if len(field_pairs) != len(table_input.fields):
                    logger.warning(
                        f"表 {table_input.raw_name} 的字段未全部匹配: "
                        f"期望 {len(table_input.fields)}, 匹配 {len(field_pairs)}"
                    )

                for field_result, field_input in field_pairs:
                    # Tolerate null or a bare string where a list is expected.
                    pii_labels = field_result.get("pii") or []
                    if isinstance(pii_labels, str):
                        pii_labels = [pii_labels]

                    field_output = FieldOutput(
                        # The submitted name is authoritative; the model's
                        # echo may be missing or altered.
                        raw_name=field_input.raw_name,
                        ai_name=field_result.get("ai_name") or field_input.raw_name,
                        desc=field_result.get("desc") or "",
                        type=field_input.type,
                        pii=pii_labels,
                        pii_type=field_result.get("pii_type"),
                        is_important_data=field_result.get("is_important_data") or False,
                        confidence=AIAnalyzeService._clamp_confidence(
                            field_result.get("confidence")
                        )
                    )

                    # Supplement the AI result with the keyword rules.
                    if enable_pii:
                        field_output = validate_pii_detection(field_output, field_input)

                    # The model's own score is replaced by the local heuristic.
                    field_output.confidence = calculate_confidence(
                        field_input, field_output
                    )

                    # Collect PII information.
                    if field_output.pii:
                        table_pii.extend(field_output.pii)
                        total_pii_fields += 1

                    # Collect important-data information.
                    if field_output.is_important_data:
                        table_important = True
                        table_important_types.append(field_output.raw_name)
                        total_important_fields += 1

                    fields_output.append(field_output)
                    total_confidence += field_output.confidence
                    total_fields += 1

                table_output = TableOutput(
                    raw_name=table_input.raw_name,
                    ai_name=table_result.get("ai_name") or table_input.raw_name,
                    desc=table_result.get("desc") or "",
                    confidence=AIAnalyzeService._clamp_confidence(
                        table_result.get("confidence")
                    ),
                    ai_completed=True,
                    fields=fields_output,
                    # De-duplicate while keeping first-seen order, so the
                    # output is deterministic.
                    pii=list(dict.fromkeys(table_pii)),
                    important=table_important,
                    important_data_types=table_important_types
                )

                tables_output.append(table_output)

            # Statistics.
            avg_confidence = (
                total_confidence / total_fields if total_fields > 0 else 0
            )
            processing_time = time.time() - start_time

            # Rough estimate (about 4 characters per token); the total is the
            # sum of both parts so the three numbers always agree.
            prompt_tokens = len(prompt) // 4
            completion_tokens = len(response_text) // 4

            response_data = {
                "tables": [table.dict() for table in tables_output],
                "statistics": Statistics(
                    total_tables=len(tables_output),
                    total_fields=total_fields,
                    pii_fields_count=total_pii_fields,
                    important_data_fields_count=total_important_fields,
                    average_confidence=round(avg_confidence, 2)
                ).dict(),
                "processing_time": round(processing_time, 2),
                "model_used": model,
                "token_usage": TokenUsage(
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=prompt_tokens + completion_tokens
                ).dict()
            }

            logger.info(
                f"AI 分析完成 - 处理时间: {processing_time:.2f}秒, "
                f"识别表数: {len(tables_output)}, PII字段数: {total_pii_fields}"
            )

            return response_data

        except Exception as e:
            logger.exception(f"AI 分析失败: {str(e)}")
            error_text = str(e)
            lowered = error_text.lower()
            raise LLMAPIException(
                f"数据资产识别失败: {error_text}",
                error_detail=error_text,
                # Both markers are matched case-insensitively.
                retryable="rate limit" in lowered or "timeout" in lowered
            ) from e
|