413 lines
13 KiB
Python
413 lines
13 KiB
Python
"""Data asset inventory report generation service (数据资产盘点报告生成服务)."""
|
||
import json
|
||
import time
|
||
from typing import Dict, Any, List
|
||
from app.schemas.delivery import (
|
||
GenerateReportRequest,
|
||
GenerateReportResponse,
|
||
ProjectInfo,
|
||
InventoryData,
|
||
ContextData,
|
||
ValueData,
|
||
)
|
||
from app.utils.llm_client import llm_client
|
||
from app.utils.logger import logger
|
||
from app.core.config import settings
|
||
from app.core.exceptions import LLMAPIException, ValidationException
|
||
|
||
|
||
# ==================== 提示词模板 ====================
|
||
|
||
# System prompt passed as `system_prompt` on every LLM call in this module.
# It sets the model's persona (data-asset management consultant) and the
# output rules (accuracy, professionalism, compliance, actionability, strict
# JSON). The text is sent to the model verbatim, so it stays in Chinese.
SYSTEM_PROMPT = """你是一位专业的数据资产管理咨询专家,擅长撰写数据资产盘点工作总结报告。你的任务是基于提供的数据盘点结果、企业背景信息和价值挖掘场景,生成一份专业、准确、符合数据合规要求的工作总结报告。

## 你的专业能力
- 深入理解数据资产管理、数据合规(PIPL、数据安全法)等法规要求
- 熟悉企业数字化转型、数据架构设计、数据治理最佳实践
- 能够识别数据资产价值、合规风险,并提供专业建议
- 具备优秀的报告撰写能力,能够生成结构清晰、逻辑严谨的专业报告

## 输出要求
1. **准确性**:所有统计数据必须基于输入数据,不得虚构
2. **专业性**:使用专业术语,符合行业标准
3. **合规性**:合规风险分析必须符合中国数据保护法规要求
4. **可操作性**:专家建议必须具体、可执行
5. **结构化**:严格按照JSON格式输出,确保数据结构完整
"""
|
||
|
||
|
||
def build_section1_2_prompt(
    project_info: ProjectInfo,
    inventory_data: InventoryData,
    context_data: ContextData
) -> str:
    """Build the user prompt for report sections 1 and 2.

    The prompt lists the project info, the inventory statistics and the
    free-text context, then shows the JSON skeleton the model must fill in.
    Section 2 of the skeleton is pre-populated with the input statistics.

    Args:
        project_info: Supplies ``project_name``, ``industry`` and the
            optional ``company_name``.
        inventory_data: Supplies ``total_data_volume``, ``total_tables``,
            ``total_fields``, ``storage_distribution`` (items with
            ``category``, ``volume``, ``storage_type``, ``color``) and
            ``data_source_structure`` (``structured``, ``semi_structured``).
        context_data: Supplies ``enterprise_background``,
            ``informatization_status`` and ``business_flow``.

    Returns:
        The complete prompt text.
    """
    storage_items = inventory_data.storage_distribution

    # Serialise through json.dumps so quotes, backslashes or newlines in the
    # input cannot break the JSON skeleton shown to the model. Values are
    # formatted to text first, so they always appear as JSON strings.
    storage_distribution_json = ",\n      ".join(
        json.dumps(
            {
                "category": f"{item.category}",
                "volume": f"{item.volume}",
                "storage_type": f"{item.storage_type}",
                "color": f"{item.color}",
            },
            ensure_ascii=False,
        )
        for item in storage_items
    )

    # Human-readable bullet list of the same storage distribution.
    storage_distribution_text = "\n".join(
        f"- {item.category}:{item.volume}({item.storage_type})"
        for item in storage_items
    )

    total_volume_json = json.dumps(
        f"{inventory_data.total_data_volume}", ensure_ascii=False
    )
    tables_json = json.dumps(
        f"{inventory_data.total_tables} 张表", ensure_ascii=False
    )
    fields_json = json.dumps(
        f"{inventory_data.total_fields} 个字段", ensure_ascii=False
    )
    source_structure = inventory_data.data_source_structure

    return f"""请基于以下信息生成报告的前两个章节:

## 项目信息
- 项目名称:{project_info.project_name}
- 行业类型:{project_info.industry}
- 企业名称:{project_info.company_name or '未提供'}

## 数据盘点结果
### 数据规模
- 总数据量:{inventory_data.total_data_volume}
- 数据表数量:{inventory_data.total_tables} 张
- 字段数量:{inventory_data.total_fields} 个

### 存储分布
{storage_distribution_text}

### 数据来源结构
- 结构化数据:{source_structure.structured}%
- 半结构化与非结构化数据:{source_structure.semi_structured}%

## 企业背景信息
{context_data.enterprise_background}

## 信息化建设现状
{context_data.informatization_status}

## 业务流与数据流
{context_data.business_flow}

## 输出要求
1. 生成章节一:企业数字化情况简介
   - 企业背景描述(1-2段,不少于100字)
   - 信息化建设现状(概述、私有云、公有云)
   - 业务流与数据流(概述、制造、物流、零售、数据聚合)

2. 生成章节二:数据资源统计
   - 数据总量统计
   - 存储分布(使用输入数据)
   - 数据来源结构(使用输入数据,确保百分比总和为100%)

请以JSON格式输出,严格按照以下结构:
{{
  "section1": {{
    "enterprise_background": {{"description": "企业背景描述"}},
    "informatization_status": {{
      "overview": "概述",
      "private_cloud": {{"title": "私有云", "description": "描述"}},
      "public_cloud": {{"title": "公有云", "description": "描述"}}
    }},
    "business_data_flow": {{
      "overview": "概述",
      "manufacturing": {{"title": "制造", "description": "描述"}},
      "logistics": {{"title": "物流", "description": "描述"}},
      "retail": {{"title": "零售", "description": "描述"}},
      "data_aggregation": {{"title": "数据聚合", "description": "描述"}}
    }}
  }},
  "section2": {{
    "summary": {{
      "total_data_volume": {total_volume_json},
      "total_data_objects": {{
        "tables": {tables_json},
        "fields": {fields_json}
      }}
    }},
    "storage_distribution": [
      {storage_distribution_json}
    ],
    "data_source_structure": {{
      "structured": {source_structure.structured},
      "semi_structured": {source_structure.semi_structured}
    }}
  }}
}}
"""
|
||
|
||
|
||
def build_section3_prompt(
    inventory_data: InventoryData,
    section1_data: Dict,
    section2_data: Dict
) -> str:
    """Build the user prompt for report section 3 (asset inventory analysis).

    Args:
        inventory_data: Supplies ``identified_assets``; each asset exposes
            ``name``, ``description`` and ``core_tables``.
        section1_data: Previously generated section 1. Accepted for a
            uniform call signature; not used in the prompt.
        section2_data: Previously generated section 2. Accepted for a
            uniform call signature; not used in the prompt.

    Returns:
        The complete prompt text, including the JSON skeleton for section 3.
    """
    identified = inventory_data.identified_assets

    assets_info = "\n".join(
        f"- {asset.name}:{asset.description}\n 核心表:{', '.join(asset.core_tables)}"
        for asset in identified
    )

    # Interpolating a Python list directly would render its repr
    # (single-quoted items), which is not valid JSON. Serialise it instead.
    high_value_assets_json = json.dumps(
        [asset.name for asset in identified], ensure_ascii=False
    )

    return f"""基于已识别的数据资产,生成详细的资产盘点分析。

## 识别的数据资产
{assets_info}

## 输出要求
对于每个数据资产,需要:
1. 详细描述资产构成(核心表、字段、数据来源)
2. 说明应用场景和价值
3. 识别合规风险(必须符合PIPL、数据安全法等要求)
4. 提供风险等级评估

合规风险必须识别:
- 个人信息(SPI)风险
- 重要数据风险
- 数据出境风险
- 数据安全风险

请以JSON格式输出:
{{
  "section3": {{
    "overview": {{
      "asset_count": {len(identified)},
      "high_value_assets": {high_value_assets_json},
      "description": "概述描述"
    }},
    "assets": [
      {{
        "id": "asset_id",
        "title": "资产标题",
        "subtitle": "英文名称",
        "composition": {{
          "description": "构成描述",
          "core_tables": ["表1", "表2"]
        }},
        "application_scenarios": {{
          "description": "应用场景描述"
        }},
        "compliance_risks": {{
          "warnings": [
            {{
              "type": "个人信息预警",
              "content": "风险描述",
              "highlights": ["高亮信息"]
            }}
          ]
        }}
      }}
    ]
  }}
}}
"""
|
||
|
||
|
||
def build_section4_prompt(
    section1_data: Dict,
    section2_data: Dict,
    section3_data: Dict,
    value_data: ValueData
) -> str:
    """Build the user prompt for report section 4 (expert recommendations).

    Args:
        section1_data: Previously generated section 1. Accepted for a
            uniform call signature; not used in the prompt.
        section2_data: Previously generated section 2. Accepted for a
            uniform call signature; not used in the prompt.
        section3_data: Previously generated section 3. Asset titles and
            compliance-warning contents are read from its ``assets`` list.
        value_data: Supplies ``selected_scenarios``; each scenario exposes
            ``name`` and ``description``.

    Returns:
        The complete prompt text, including the JSON skeleton for section 4.
    """
    scenarios_info = "\n".join(
        f"- {scenario.name}:{scenario.description}"
        for scenario in value_data.selected_scenarios
    )

    # section3_data is parsed LLM output, so any key may be missing or null.
    # `or` fallbacks cover both cases without raising AttributeError.
    assets = section3_data.get("assets") or []

    # Skip assets without a title so the joined list has no empty entries.
    asset_names = [asset["title"] for asset in assets if asset.get("title")]

    # Collect the non-empty warning texts across all assets.
    risks = []
    for asset in assets:
        warnings = (asset.get("compliance_risks") or {}).get("warnings") or []
        risks.extend(w["content"] for w in warnings if w.get("content"))

    assets_text = ", ".join(asset_names) if asset_names else "无"
    # Only the first five risks are listed in the prompt.
    risks_text = (
        "\n".join(f"- {risk}" for risk in risks[:5]) if risks else "无重大合规风险"
    )
    # Same "none" fallback as the asset list when no scenario was selected.
    scenarios_text = scenarios_info or "无"

    return f"""基于前面章节的分析结果,生成专家建议和下一步计划。

## 识别的数据资产
{assets_text}

## 合规风险汇总
{risks_text}

## 价值挖掘场景
{scenarios_text}

## 输出要求
建议需要:
1. 针对识别出的合规风险提供整改方案
2. 提供技术演进建议(架构优化、技术选型)
3. 提供价值深化建议(场景优化、数据应用)

请以JSON格式输出:
{{
  "section4": {{
    "compliance_remediation": {{
      "title": "合规整改",
      "items": [
        {{
          "order": 1,
          "category": "分类",
          "description": "详细建议",
          "code_references": ["表名"]
        }}
      ]
    }},
    "technical_evolution": {{
      "title": "技术演进",
      "description": "技术建议描述",
      "technologies": ["技术1", "技术2"]
    }},
    "value_deepening": {{
      "title": "价值深化",
      "items": [
        {{
          "description": "建议描述",
          "scenarios": ["相关场景"]
        }}
      ]
    }}
  }}
}}
"""
|
||
|
||
|
||
# ==================== 数据验证 ====================
|
||
|
||
def validate_section2_data(section2_data: Dict, inventory_data: InventoryData) -> None:
    """Validate the generated section 2 data.

    Checks that the ``structured`` and ``semi_structured`` percentages under
    ``data_source_structure`` add up to 100.

    Args:
        section2_data: Parsed section 2 output from the LLM.
        inventory_data: The original inventory input. Currently unused;
            kept so the call signature stays stable.

    Raises:
        ValidationException: If a percentage is not numeric, or the two
            percentages do not sum to 100.
    """
    structure = section2_data.get("data_source_structure") or {}

    # The values come from parsed LLM output and may arrive as ints, floats
    # or numeric strings. Coerce them so strings are added, not concatenated.
    try:
        structured = float(structure.get("structured", 0))
        semi_structured = float(structure.get("semi_structured", 0))
    except (AttributeError, TypeError, ValueError) as exc:
        raise ValidationException(
            f"数据来源结构百分比必须为数字: {structure!r}"
        ) from exc

    total = structured + semi_structured

    # Compare with a tolerance because float addition is inexact. The
    # `not (<=)` form also rejects NaN.
    if not abs(total - 100) <= 1e-6:
        raise ValidationException(
            f"数据来源结构百分比总和必须为100%,当前为 {total:g}%"
        )
|
||
|
||
|
||
def validate_section3_data(section3_data: Dict) -> None:
    """Validate the generated section 3 data.

    Requires at least one asset. An asset without compliance warnings is
    logged as a warning but does not fail validation.

    Args:
        section3_data: Parsed section 3 output from the LLM.

    Raises:
        ValidationException: If the ``assets`` list is missing or empty.
    """
    # Parsed LLM output may carry explicit nulls, so fall back with `or`
    # rather than relying on the .get() default alone.
    assets = section3_data.get("assets") or []

    if not assets:
        raise ValidationException("必须至少包含一个数据资产")

    for idx, asset in enumerate(assets):
        warnings = (asset.get("compliance_risks") or {}).get("warnings")
        if not warnings:
            # Fall back to the 1-based position when the asset has no title.
            logger.warning(f"资产 {asset.get('title', idx + 1)} 缺少合规风险分析")
|
||
|
||
|
||
# ==================== 主要服务类 ====================
|
||
|
||
class ReportGenerationService:
    """Report generation service.

    Produces the report in three sequential LLM calls: sections 1 and 2
    together, then section 3, then section 4. Section 4's prompt is built
    from the section 3 result, so the calls cannot run concurrently.
    """

    @staticmethod
    async def _generate_section(prompt: str, model: Any, temperature: Any) -> Dict[str, Any]:
        """Send one prompt to the LLM and return its parsed JSON response."""
        response = await llm_client.call(
            prompt=prompt,
            system_prompt=SYSTEM_PROMPT,
            temperature=temperature,
            model=model
        )
        return llm_client.parse_json_response(response)

    @staticmethod
    async def generate(request: GenerateReportRequest) -> Dict[str, Any]:
        """Generate the data asset inventory report.

        Args:
            request: Report generation request carrying ``project_info``,
                ``inventory_data``, ``context_data`` and ``value_data``.

        Returns:
            A dict with ``header``, ``section1`` to ``section4``,
            ``generation_time`` (seconds, rounded to 2 decimals) and
            ``model_used``.

        Raises:
            LLMAPIException: On any failure, including validation errors.
                The original exception is attached as ``__cause__``.
        """
        start_time = time.time()
        asset_count = len(request.inventory_data.identified_assets)

        logger.info(
            f"开始生成报告 - 项目: {request.project_info.project_name}, "
            f"资产数: {asset_count}"
        )

        try:
            # Read the LLM configuration.
            model = settings.DEFAULT_LLM_MODEL
            temperature = settings.DEFAULT_TEMPERATURE

            # Stage 1: sections 1 and 2.
            logger.info("生成章节一和章节二...")
            result_1_2 = await ReportGenerationService._generate_section(
                build_section1_2_prompt(
                    request.project_info,
                    request.inventory_data,
                    request.context_data
                ),
                model,
                temperature
            )
            section1 = result_1_2.get("section1", {})
            section2 = result_1_2.get("section2", {})
            validate_section2_data(section2, request.inventory_data)
            logger.info("章节一和章节二生成成功")

            # Stage 2: section 3.
            logger.info("生成章节三...")
            result_3 = await ReportGenerationService._generate_section(
                build_section3_prompt(request.inventory_data, section1, section2),
                model,
                temperature
            )
            section3 = result_3.get("section3", {})
            validate_section3_data(section3)
            logger.info("章节三生成成功")

            # Stage 3: section 4, built from the section 3 result.
            logger.info("生成章节四...")
            result_4 = await ReportGenerationService._generate_section(
                build_section4_prompt(section1, section2, section3, request.value_data),
                model,
                temperature
            )
            logger.info("章节四生成成功")

            # Assemble the full response.
            generation_time = time.time() - start_time

            response_data = {
                "header": {
                    "project_name": request.project_info.project_name
                },
                "section1": section1,
                "section2": section2,
                "section3": section3,
                "section4": result_4.get("section4", {}),
                "generation_time": round(generation_time, 2),
                "model_used": model
            }

            logger.info(
                f"报告生成完成 - 耗时: {generation_time:.2f}秒, "
                f"资产数: {asset_count}"
            )

            return response_data

        except Exception as e:
            # Top-level boundary: every failure is logged and surfaced to the
            # caller as LLMAPIException.
            # NOTE(review): an LLMAPIException raised by llm_client is
            # re-wrapped here too, so any retry flag it carried is replaced
            # by the text heuristic below — confirm that is intended.
            message = str(e)
            logger.exception(f"报告生成失败: {message}")
            lowered = message.lower()
            raise LLMAPIException(
                f"报告生成失败: {message}",
                error_detail=message,
                # Match both markers case-insensitively; provider messages
                # vary in capitalisation.
                retryable="rate limit" in lowered or "timeout" in lowered
            ) from e
|