""" 数据资产盘点报告生成服务 """ import json import time from typing import Dict, Any, List from app.schemas.delivery import ( GenerateReportRequest, GenerateReportResponse, ProjectInfo, InventoryData, ContextData, ValueData, ) from app.utils.llm_client import llm_client from app.utils.logger import logger from app.core.config import settings from app.core.exceptions import LLMAPIException, ValidationException # ==================== 提示词模板 ==================== SYSTEM_PROMPT = """你是一位专业的数据资产管理咨询专家,擅长撰写数据资产盘点工作总结报告。你的任务是基于提供的数据盘点结果、企业背景信息和价值挖掘场景,生成一份专业、准确、符合数据合规要求的工作总结报告。 ## 你的专业能力 - 深入理解数据资产管理、数据合规(PIPL、数据安全法)等法规要求 - 熟悉企业数字化转型、数据架构设计、数据治理最佳实践 - 能够识别数据资产价值、合规风险,并提供专业建议 - 具备优秀的报告撰写能力,能够生成结构清晰、逻辑严谨的专业报告 ## 输出要求 1. **准确性**:所有统计数据必须基于输入数据,不得虚构 2. **专业性**:使用专业术语,符合行业标准 3. **合规性**:合规风险分析必须符合中国数据保护法规要求 4. **可操作性**:专家建议必须具体、可执行 5. **结构化**:严格按照JSON格式输出,确保数据结构完整 """ def build_section1_2_prompt( project_info: ProjectInfo, inventory_data: InventoryData, context_data: ContextData ) -> str: """构建章节一和章节二的提示词""" # 格式化存储分布(用于JSON部分) storage_distribution_json = ",\n ".join([ f'{{"category": "{item.category}", "volume": "{item.volume}", "storage_type": "{item.storage_type}", "color": "{item.color}"}}' for item in inventory_data.storage_distribution ]) return f"""请基于以下信息生成报告的前两个章节: ## 项目信息 - 项目名称:{project_info.project_name} - 行业类型:{project_info.industry} - 企业名称:{project_info.company_name or '未提供'} ## 数据盘点结果 ### 数据规模 - 总数据量:{inventory_data.total_data_volume} - 数据表数量:{inventory_data.total_tables} 张 - 字段数量:{inventory_data.total_fields} 个 ### 存储分布 {chr(10).join(f"- {item.category}:{item.volume}({item.storage_type})" for item in inventory_data.storage_distribution)} ### 数据来源结构 - 结构化数据:{inventory_data.data_source_structure.structured}% - 半结构化与非结构化数据:{inventory_data.data_source_structure.semi_structured}% ## 企业背景信息 {context_data.enterprise_background} ## 信息化建设现状 {context_data.informatization_status} ## 业务流与数据流 {context_data.business_flow} ## 输出要求 1. 生成章节一:企业数字化情况简介 - 企业背景描述(1-2段,不少于100字) - 信息化建设现状(概述、私有云、公有云) - 业务流与数据流(概述、制造、物流、零售、数据聚合) 2. 生成章节二:数据资源统计 - 数据总量统计 - 存储分布(使用输入数据) - 数据来源结构(使用输入数据,确保百分比总和为100%) 请以JSON格式输出,严格按照以下结构: {{ "section1": {{ "enterprise_background": {{"description": "企业背景描述"}}, "informatization_status": {{ "overview": "概述", "private_cloud": {{"title": "私有云", "description": "描述"}}, "public_cloud": {{"title": "公有云", "description": "描述"}} }}, "business_data_flow": {{ "overview": "概述", "manufacturing": {{"title": "制造", "description": "描述"}}, "logistics": {{"title": "物流", "description": "描述"}}, "retail": {{"title": "零售", "description": "描述"}}, "data_aggregation": {{"title": "数据聚合", "description": "描述"}} }} }}, "section2": {{ "summary": {{ "total_data_volume": "{inventory_data.total_data_volume}", "total_data_objects": {{ "tables": "{inventory_data.total_tables} 张表", "fields": "{inventory_data.total_fields} 个字段" }} }}, "storage_distribution": [ {storage_distribution_json} ], "data_source_structure": {{ "structured": {inventory_data.data_source_structure.structured}, "semi_structured": {inventory_data.data_source_structure.semi_structured} }} }} }} """ def build_section3_prompt( inventory_data: InventoryData, section1_data: Dict, section2_data: Dict ) -> str: """构建章节三的提示词""" assets_info = "\n".join([ f"- {asset.name}:{asset.description}\n 核心表:{', '.join(asset.core_tables)}" for asset in inventory_data.identified_assets ]) return f"""基于已识别的数据资产,生成详细的资产盘点分析。 ## 识别的数据资产 {assets_info} ## 输出要求 对于每个数据资产,需要: 1. 详细描述资产构成(核心表、字段、数据来源) 2. 说明应用场景和价值 3. 识别合规风险(必须符合PIPL、数据安全法等要求) 4. 提供风险等级评估 合规风险必须识别: - 个人信息(SPI)风险 - 重要数据风险 - 数据出境风险 - 数据安全风险 请以JSON格式输出: {{ "section3": {{ "overview": {{ "asset_count": {len(inventory_data.identified_assets)}, "high_value_assets": {[asset.name for asset in inventory_data.identified_assets]}, "description": "概述描述" }}, "assets": [ {{ "id": "asset_id", "title": "资产标题", "subtitle": "英文名称", "composition": {{ "description": "构成描述", "core_tables": ["表1", "表2"] }}, "application_scenarios": {{ "description": "应用场景描述" }}, "compliance_risks": {{ "warnings": [ {{ "type": "个人信息预警", "content": "风险描述", "highlights": ["高亮信息"] }} ] }} }} ] }} }} """ def build_section4_prompt( section1_data: Dict, section2_data: Dict, section3_data: Dict, value_data: ValueData ) -> str: """构建章节四的提示词""" scenarios_info = "\n".join([ f"- {scenario.name}:{scenario.description}" for scenario in value_data.selected_scenarios ]) # 提取资产信息 assets = section3_data.get("assets", []) asset_names = [asset.get("title", "") for asset in assets] # 提取合规风险 risks = [] for asset in assets: warnings = asset.get("compliance_risks", {}).get("warnings", []) risks.extend([w.get("content", "") for w in warnings]) return f"""基于前面章节的分析结果,生成专家建议和下一步计划。 ## 识别的数据资产 {', '.join(asset_names) if asset_names else '无'} ## 合规风险汇总 {chr(10).join(f"- {risk}" for risk in risks[:5]) if risks else '无重大合规风险'} ## 价值挖掘场景 {scenarios_info} ## 输出要求 建议需要: 1. 针对识别出的合规风险提供整改方案 2. 提供技术演进建议(架构优化、技术选型) 3. 提供价值深化建议(场景优化、数据应用) 请以JSON格式输出: {{ "section4": {{ "compliance_remediation": {{ "title": "合规整改", "items": [ {{ "order": 1, "category": "分类", "description": "详细建议", "code_references": ["表名"] }} ] }}, "technical_evolution": {{ "title": "技术演进", "description": "技术建议描述", "technologies": ["技术1", "技术2"] }}, "value_deepening": {{ "title": "价值深化", "items": [ {{ "description": "建议描述", "scenarios": ["相关场景"] }} ] }} }} }} """ # ==================== 数据验证 ==================== def validate_section2_data(section2_data: Dict, inventory_data: InventoryData) -> None: """验证章节二数据""" structured = section2_data.get("data_source_structure", {}).get("structured", 0) semi_structured = section2_data.get("data_source_structure", {}).get("semi_structured", 0) if structured + semi_structured != 100: raise ValidationException( f"数据来源结构百分比总和必须为100%,当前为 {structured + semi_structured}%" ) def validate_section3_data(section3_data: Dict) -> None: """验证章节三数据""" assets = section3_data.get("assets", []) if not assets: raise ValidationException("必须至少包含一个数据资产") for idx, asset in enumerate(assets): warnings = asset.get("compliance_risks", {}).get("warnings", []) if not warnings: logger.warning(f"资产 {asset.get('title', idx + 1)} 缺少合规风险分析") # ==================== 主要服务类 ==================== class ReportGenerationService: """报告生成服务""" @staticmethod async def generate(request: GenerateReportRequest) -> Dict[str, Any]: """ 生成数据资产盘点报告 Args: request: 报告生成请求 Returns: 报告生成结果 """ start_time = time.time() logger.info( f"开始生成报告 - 项目: {request.project_info.project_name}, " f"资产数: {len(request.inventory_data.identified_assets)}" ) try: # 获取配置 model = settings.DEFAULT_LLM_MODEL temperature = settings.DEFAULT_TEMPERATURE # 阶段一:生成章节一和章节二 logger.info("生成章节一和章节二...") prompt_1_2 = build_section1_2_prompt( request.project_info, request.inventory_data, request.context_data ) response_1_2 = await llm_client.call( prompt=prompt_1_2, system_prompt=SYSTEM_PROMPT, temperature=temperature, model=model ) result_1_2 = llm_client.parse_json_response(response_1_2) # 验证章节二数据 validate_section2_data(result_1_2.get("section2", {}), request.inventory_data) logger.info("章节一和章节二生成成功") # 阶段二:生成章节三 logger.info("生成章节三...") prompt_3 = build_section3_prompt( request.inventory_data, result_1_2.get("section1", {}), result_1_2.get("section2", {}) ) response_3 = await llm_client.call( prompt=prompt_3, system_prompt=SYSTEM_PROMPT, temperature=temperature, model=model ) result_3 = llm_client.parse_json_response(response_3) # 验证章节三数据 validate_section3_data(result_3.get("section3", {})) logger.info("章节三生成成功") # 阶段三:生成章节四 logger.info("生成章节四...") prompt_4 = build_section4_prompt( result_1_2.get("section1", {}), result_1_2.get("section2", {}), result_3.get("section3", {}), request.value_data ) response_4 = await llm_client.call( prompt=prompt_4, system_prompt=SYSTEM_PROMPT, temperature=temperature, model=model ) result_4 = llm_client.parse_json_response(response_4) logger.info("章节四生成成功") # 构建完整响应 generation_time = time.time() - start_time response_data = { "header": { "project_name": request.project_info.project_name }, "section1": result_1_2.get("section1", {}), "section2": result_1_2.get("section2", {}), "section3": result_3.get("section3", {}), "section4": result_4.get("section4", {}), "generation_time": round(generation_time, 2), "model_used": model } logger.info( f"报告生成完成 - 耗时: {generation_time:.2f}秒, " f"资产数: {len(request.inventory_data.identified_assets)}" ) return response_data except Exception as e: logger.exception(f"报告生成失败: {str(e)}") raise LLMAPIException( f"报告生成失败: {str(e)}", error_detail=str(e), retryable="Rate limit" in str(e) or "timeout" in str(e).lower() )