finyx_data_ai/app/services/report_generation_service.py
2026-01-11 07:48:19 +08:00

413 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
数据资产盘点报告生成服务
"""
import json
import time
from typing import Dict, Any, List
from app.schemas.delivery import (
GenerateReportRequest,
GenerateReportResponse,
ProjectInfo,
InventoryData,
ContextData,
ValueData,
)
from app.utils.llm_client import llm_client
from app.utils.logger import logger
from app.core.config import settings
from app.core.exceptions import LLMAPIException, ValidationException
# ==================== Prompt templates ====================
# System prompt sent (as `system_prompt`) with the LLM calls made by
# ReportGenerationService.generate. It sets the consultant persona and lists
# the output requirements: accuracy, professionalism, compliance,
# actionability, and JSON-structured output.
# NOTE(review): the text is runtime data and is reproduced verbatim; item 5
# appears to be missing punctuation compared with items 1-4 — confirm against
# the upstream file before changing it.
SYSTEM_PROMPT = """你是一位专业的数据资产管理咨询专家,擅长撰写数据资产盘点工作总结报告。你的任务是基于提供的数据盘点结果、企业背景信息和价值挖掘场景,生成一份专业、准确、符合数据合规要求的工作总结报告。
## 你的专业能力
- 深入理解数据资产管理、数据合规PIPL、数据安全法等法规要求
- 熟悉企业数字化转型、数据架构设计、数据治理最佳实践
- 能够识别数据资产价值、合规风险,并提供专业建议
- 具备优秀的报告撰写能力,能够生成结构清晰、逻辑严谨的专业报告
## 输出要求
1. **准确性**:所有统计数据必须基于输入数据,不得虚构
2. **专业性**:使用专业术语,符合行业标准
3. **合规性**:合规风险分析必须符合中国数据保护法规要求
4. **可操作性**:专家建议必须具体、可执行
5. **结构化**严格按照JSON格式输出确保数据结构完整
"""
def build_section1_2_prompt(
    project_info: ProjectInfo,
    inventory_data: InventoryData,
    context_data: ContextData
) -> str:
    """Build the user prompt for report sections 1 and 2.

    Args:
        project_info: Supplies ``project_name``, ``industry`` and the optional
            ``company_name`` (rendered as "未提供" when falsy).
        inventory_data: Supplies the totals, the ``storage_distribution``
            items (``category``, ``volume``, ``storage_type``, ``color``) and
            the ``data_source_structure`` percentages.
        context_data: Supplies the free-text enterprise background,
            informatization status and business-flow descriptions.

    Returns:
        The prompt text, ending with the JSON skeleton the model must fill in.
    """
    distribution = inventory_data.storage_distribution

    # Serialize with json.dumps so quotes, backslashes or newlines inside a
    # value cannot corrupt the JSON skeleton. str() keeps every value rendered
    # as a JSON string, exactly as the hand-written template did.
    storage_distribution_json = ",\n ".join(
        json.dumps(
            {
                "category": str(item.category),
                "volume": str(item.volume),
                "storage_type": str(item.storage_type),
                "color": str(item.color),
            },
            ensure_ascii=False,
        )
        for item in distribution
    )

    # Human-readable bullet list; the fields are explicitly separated so the
    # model does not see them fused into a single token.
    storage_lines = "\n".join(
        f"- {item.category}:{item.volume} ({item.storage_type})"
        for item in distribution
    )

    # Already includes the surrounding double quotes.
    total_data_volume_json = json.dumps(
        str(inventory_data.total_data_volume), ensure_ascii=False
    )

    return f"""请基于以下信息生成报告的前两个章节:
## 项目信息
- 项目名称:{project_info.project_name}
- 行业类型:{project_info.industry}
- 企业名称:{project_info.company_name or '未提供'}
## 数据盘点结果
### 数据规模
- 总数据量:{inventory_data.total_data_volume}
- 数据表数量:{inventory_data.total_tables}
- 字段数量:{inventory_data.total_fields}
### 存储分布
{storage_lines}
### 数据来源结构
- 结构化数据:{inventory_data.data_source_structure.structured}%
- 半结构化与非结构化数据:{inventory_data.data_source_structure.semi_structured}%
## 企业背景信息
{context_data.enterprise_background}
## 信息化建设现状
{context_data.informatization_status}
## 业务流与数据流
{context_data.business_flow}
## 输出要求
1. 生成章节一:企业数字化情况简介
- 企业背景描述1-2段不少于100字
- 信息化建设现状(概述、私有云、公有云)
- 业务流与数据流(概述、制造、物流、零售、数据聚合)
2. 生成章节二:数据资源统计
- 数据总量统计
- 存储分布(使用输入数据)
- 数据来源结构使用输入数据确保百分比总和为100%
请以JSON格式输出严格按照以下结构
{{
"section1": {{
"enterprise_background": {{"description": "企业背景描述"}},
"informatization_status": {{
"overview": "概述",
"private_cloud": {{"title": "私有云", "description": "描述"}},
"public_cloud": {{"title": "公有云", "description": "描述"}}
}},
"business_data_flow": {{
"overview": "概述",
"manufacturing": {{"title": "制造", "description": "描述"}},
"logistics": {{"title": "物流", "description": "描述"}},
"retail": {{"title": "零售", "description": "描述"}},
"data_aggregation": {{"title": "数据聚合", "description": "描述"}}
}}
}},
"section2": {{
"summary": {{
"total_data_volume": {total_data_volume_json},
"total_data_objects": {{
"tables": "{inventory_data.total_tables} 张表",
"fields": "{inventory_data.total_fields} 个字段"
}}
}},
"storage_distribution": [
{storage_distribution_json}
],
"data_source_structure": {{
"structured": {inventory_data.data_source_structure.structured},
"semi_structured": {inventory_data.data_source_structure.semi_structured}
}}
}}
}}
"""
def build_section3_prompt(
    inventory_data: InventoryData,
    section1_data: Dict,
    section2_data: Dict
) -> str:
    """Build the user prompt for report section 3 (asset inventory analysis).

    Args:
        inventory_data: Supplies ``identified_assets``; each asset provides
            ``name``, ``description`` and ``core_tables``.
        section1_data: Accepted for signature compatibility; not used when
            building the prompt.
        section2_data: Accepted for signature compatibility; not used when
            building the prompt.

    Returns:
        The prompt text, ending with the JSON skeleton the model must fill in.
    """
    identified = inventory_data.identified_assets

    assets_info = "\n".join(
        f"- {asset.name}:{asset.description}\n 核心表:{', '.join(asset.core_tables)}"
        for asset in identified
    )

    # A Python list repr uses single quotes (['a', 'b']), which is not valid
    # JSON; json.dumps emits a proper JSON array and escapes special characters.
    high_value_assets_json = json.dumps(
        [str(asset.name) for asset in identified], ensure_ascii=False
    )

    return f"""基于已识别的数据资产,生成详细的资产盘点分析。
## 识别的数据资产
{assets_info}
## 输出要求
对于每个数据资产,需要:
1. 详细描述资产构成(核心表、字段、数据来源)
2. 说明应用场景和价值
3. 识别合规风险必须符合PIPL、数据安全法等要求
4. 提供风险等级评估
合规风险必须识别:
- 个人信息SPI风险
- 重要数据风险
- 数据出境风险
- 数据安全风险
请以JSON格式输出
{{
"section3": {{
"overview": {{
"asset_count": {len(identified)},
"high_value_assets": {high_value_assets_json},
"description": "概述描述"
}},
"assets": [
{{
"id": "asset_id",
"title": "资产标题",
"subtitle": "英文名称",
"composition": {{
"description": "构成描述",
"core_tables": ["表1", "表2"]
}},
"application_scenarios": {{
"description": "应用场景描述"
}},
"compliance_risks": {{
"warnings": [
{{
"type": "个人信息预警",
"content": "风险描述",
"highlights": ["高亮信息"]
}}
]
}}
}}
]
}}
}}
"""
def build_section4_prompt(
    section1_data: Dict,
    section2_data: Dict,
    section3_data: Dict,
    value_data: ValueData
) -> str:
    """Build the user prompt for report section 4 (expert recommendations).

    Args:
        section1_data: Accepted for signature compatibility; not used when
            building the prompt.
        section2_data: Accepted for signature compatibility; not used when
            building the prompt.
        section3_data: Parsed section-3 output; asset titles and compliance
            warning contents are read from its ``assets`` list.
        value_data: Supplies ``selected_scenarios`` (each with ``name`` and
            ``description``).

    Returns:
        The prompt text, ending with the JSON skeleton the model must fill in.
    """
    scenarios_info = "\n".join(
        f"- {scenario.name}:{scenario.description}"
        for scenario in value_data.selected_scenarios
    )

    # section3_data is parsed model output, so any level may be null or have an
    # unexpected type; skip such entries instead of raising AttributeError.
    raw_assets = section3_data.get("assets") or []
    assets = [asset for asset in raw_assets if isinstance(asset, dict)]

    # Drop empty titles so the joined list has no blank entries.
    asset_names = [str(asset["title"]) for asset in assets if asset.get("title")]

    risks: List[str] = []
    for asset in assets:
        compliance = asset.get("compliance_risks")
        warnings = compliance.get("warnings") if isinstance(compliance, dict) else None
        if not isinstance(warnings, list):
            continue
        risks.extend(
            str(warning["content"])
            for warning in warnings
            if isinstance(warning, dict) and warning.get("content")
        )

    asset_names_text = ", ".join(asset_names) if asset_names else ""
    # Only the first five risks are listed in the prompt.
    risks_text = "\n".join(f"- {risk}" for risk in risks[:5]) if risks else "无重大合规风险"

    return f"""基于前面章节的分析结果,生成专家建议和下一步计划。
## 识别的数据资产
{asset_names_text}
## 合规风险汇总
{risks_text}
## 价值挖掘场景
{scenarios_info}
## 输出要求
建议需要:
1. 针对识别出的合规风险提供整改方案
2. 提供技术演进建议(架构优化、技术选型)
3. 提供价值深化建议(场景优化、数据应用)
请以JSON格式输出
{{
"section4": {{
"compliance_remediation": {{
"title": "合规整改",
"items": [
{{
"order": 1,
"category": "分类",
"description": "详细建议",
"code_references": ["表名"]
}}
]
}},
"technical_evolution": {{
"title": "技术演进",
"description": "技术建议描述",
"technologies": ["技术1", "技术2"]
}},
"value_deepening": {{
"title": "价值深化",
"items": [
{{
"description": "建议描述",
"scenarios": ["相关场景"]
}}
]
}}
}}
}}
"""
# ==================== 数据验证 ====================
def validate_section2_data(section2_data: Dict, inventory_data: InventoryData) -> None:
    """Check that the section-2 data-source percentages add up to 100.

    Args:
        section2_data: Parsed section-2 output; ``data_source_structure`` is
            expected to hold ``structured`` and ``semi_structured`` values
            (each treated as 0 when missing).
        inventory_data: Accepted for signature compatibility; not used by the
            check.

    Raises:
        ValidationException: If either value is not numeric, or the two values
            do not sum to 100 (within a small floating-point tolerance).
    """
    tolerance = 1e-6  # absorbs float rounding such as 70.1 + 29.9

    structure = section2_data.get("data_source_structure")
    if not isinstance(structure, dict):
        # Parsed model output may contain null here; treat it as "missing".
        structure = {}

    raw_structured = structure.get("structured", 0)
    raw_semi_structured = structure.get("semi_structured", 0)

    # Coerce before adding: "35" + "65" would otherwise concatenate to "3565",
    # and None would surface as a bare TypeError instead of a validation error.
    try:
        total = float(raw_structured) + float(raw_semi_structured)
    except (TypeError, ValueError) as exc:
        raise ValidationException(
            f"数据来源结构百分比必须为数字,当前为 structured={raw_structured!r}, "
            f"semi_structured={raw_semi_structured!r}"
        ) from exc

    # Written as "not <=" so that NaN is rejected as well.
    if not abs(total - 100) <= tolerance:
        raise ValidationException(
            f"数据来源结构百分比总和必须为100%,当前为 {total:g}%"
        )
def validate_section3_data(section3_data: Dict) -> None:
    """Check that section 3 lists at least one asset and flag missing risk analysis.

    An asset without compliance warnings is not an error: it is only logged at
    warning level, identified by its title (or its 1-based position when it has
    no title).

    Args:
        section3_data: Parsed section-3 output; its ``assets`` list is inspected.

    Raises:
        ValidationException: If ``assets`` is missing or empty.
    """
    assets = section3_data.get("assets") or []
    if not assets:
        raise ValidationException("必须至少包含一个数据资产")

    for position, asset in enumerate(assets, start=1):
        # Parsed model output may contain nulls or wrong types at any level;
        # treat those as "no warnings" rather than raising AttributeError.
        is_mapping = isinstance(asset, dict)
        compliance = asset.get("compliance_risks") if is_mapping else None
        warnings = compliance.get("warnings") if isinstance(compliance, dict) else None
        if not warnings:
            label = asset.get("title", position) if is_mapping else position
            logger.warning(f"资产 {label} 缺少合规风险分析")
# ==================== 主要服务类 ====================
class ReportGenerationService:
    """Generates the data-asset inventory report through three sequential LLM calls."""

    @staticmethod
    async def _request_json(prompt: str, model: Any, temperature: Any) -> Dict[str, Any]:
        """Send one prompt with the shared system prompt and return the parsed JSON reply."""
        raw_response = await llm_client.call(
            prompt=prompt,
            system_prompt=SYSTEM_PROMPT,
            temperature=temperature,
            model=model
        )
        return llm_client.parse_json_response(raw_response)

    @staticmethod
    async def generate(request: GenerateReportRequest) -> Dict[str, Any]:
        """Generate the data-asset inventory report.

        The report is produced in three stages, each feeding the next:
        sections 1-2, then section 3, then section 4. Sections 2 and 3 are
        validated right after they are generated.

        Args:
            request: Report generation request carrying ``project_info``,
                ``inventory_data``, ``context_data`` and ``value_data``.

        Returns:
            A dict with ``header``, ``section1`` .. ``section4``,
            ``generation_time`` (seconds, rounded to 2 decimals) and
            ``model_used``.

        Raises:
            LLMAPIException: On any failure during generation, including
                validation failures. The original exception is chained as
                ``__cause__``; ``retryable`` is set when the error text
                mentions a rate limit or a timeout.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        started_at = time.perf_counter()
        asset_count = len(request.inventory_data.identified_assets)
        logger.info(
            f"开始生成报告 - 项目: {request.project_info.project_name}, "
            f"资产数: {asset_count}"
        )
        try:
            model = settings.DEFAULT_LLM_MODEL
            temperature = settings.DEFAULT_TEMPERATURE

            # Stage 1: sections 1 and 2.
            logger.info("生成章节一和章节二...")
            prompt_1_2 = build_section1_2_prompt(
                request.project_info,
                request.inventory_data,
                request.context_data
            )
            result_1_2 = await ReportGenerationService._request_json(
                prompt_1_2, model, temperature
            )
            section1 = result_1_2.get("section1", {})
            section2 = result_1_2.get("section2", {})
            validate_section2_data(section2, request.inventory_data)
            logger.info("章节一和章节二生成成功")

            # Stage 2: section 3.
            logger.info("生成章节三...")
            prompt_3 = build_section3_prompt(
                request.inventory_data,
                section1,
                section2
            )
            result_3 = await ReportGenerationService._request_json(
                prompt_3, model, temperature
            )
            section3 = result_3.get("section3", {})
            validate_section3_data(section3)
            logger.info("章节三生成成功")

            # Stage 3: section 4, built from the earlier sections.
            logger.info("生成章节四...")
            prompt_4 = build_section4_prompt(
                section1,
                section2,
                section3,
                request.value_data
            )
            result_4 = await ReportGenerationService._request_json(
                prompt_4, model, temperature
            )
            logger.info("章节四生成成功")

            # Assemble the full response.
            generation_time = time.perf_counter() - started_at
            response_data = {
                "header": {
                    "project_name": request.project_info.project_name
                },
                "section1": section1,
                "section2": section2,
                "section3": section3,
                "section4": result_4.get("section4", {}),
                "generation_time": round(generation_time, 2),
                "model_used": model
            }
            logger.info(
                f"报告生成完成 - 耗时: {generation_time:.2f}秒, "
                f"资产数: {asset_count}"
            )
            return response_data
        except Exception as e:
            message = str(e)
            logger.exception(f"报告生成失败: {message}")
            # Match both markers case-insensitively; previously "Rate limit"
            # was case-sensitive while "timeout" was not.
            lowered = message.lower()
            raise LLMAPIException(
                f"报告生成失败: {message}",
                error_detail=message,
                retryable="rate limit" in lowered or "timeout" in lowered
            ) from e