# finyx_data_ai/app/services/report_generation_service.py

"""
数据资产盘点报告生成服务
"""
import json
import time
from typing import Dict, Any, List
from app.schemas.delivery import (
    GenerateReportRequest,
    GenerateReportResponse,
    ProjectInfo,
    InventoryData,
    ContextData,
    ValueData,
)
from app.utils.llm_client import llm_client
from app.utils.logger import logger
from app.core.config import settings
from app.core.exceptions import LLMAPIException, ValidationException


# ==================== 提示词模板 ====================

# System prompt passed as `system_prompt` on every LLM call made by
# ReportGenerationService.generate. The text (kept in Chinese because it is
# sent to the model verbatim) casts the model as a data-asset management
# consultant and lists five output requirements: accuracy, professionalism,
# compliance, actionability, and strictly structured JSON output.
SYSTEM_PROMPT = """你是一位专业的数据资产管理咨询专家，擅长撰写数据资产盘点工作总结报告。你的任务是基于提供的数据盘点结果、企业背景信息和价值挖掘场景，生成一份专业、准确、符合数据合规要求的工作总结报告。

## 你的专业能力
- 深入理解数据资产管理、数据合规（PIPL、数据安全法）等法规要求
- 熟悉企业数字化转型、数据架构设计、数据治理最佳实践
- 能够识别数据资产价值、合规风险，并提供专业建议
- 具备优秀的报告撰写能力，能够生成结构清晰、逻辑严谨的专业报告

## 输出要求
1. **准确性**：所有统计数据必须基于输入数据，不得虚构
2. **专业性**：使用专业术语，符合行业标准
3. **合规性**：合规风险分析必须符合中国数据保护法规要求
4. **可操作性**：专家建议必须具体、可执行
5. **结构化**：严格按照JSON格式输出，确保数据结构完整
"""


def _json_literal(value: Any) -> str:
    """Render *value* as a quoted JSON string literal.

    The value is first formatted exactly as an f-string would format it, then
    passed through ``json.dumps`` so quotes and backslashes are escaped while
    non-ASCII characters stay readable.
    """
    return json.dumps(format(value), ensure_ascii=False)


def build_section1_2_prompt(
    project_info: ProjectInfo,
    inventory_data: InventoryData,
    context_data: ContextData
) -> str:
    """Build the user prompt for report sections 1 and 2.

    The prompt lists the project information, the inventory statistics and the
    enterprise context, then shows the model the exact JSON skeleton to fill
    in. Values copied from the input into that skeleton are JSON-escaped, so a
    quote or backslash in, say, a storage category cannot turn the skeleton
    into invalid JSON.

    Args:
        project_info: Supplies ``project_name``, ``industry`` and
            ``company_name`` (a falsy company name is shown as '未提供').
        inventory_data: Supplies ``total_data_volume``, ``total_tables``,
            ``total_fields``, ``storage_distribution`` (items with
            ``category``, ``volume``, ``storage_type``, ``color``) and
            ``data_source_structure`` (``structured``, ``semi_structured``).
        context_data: Supplies ``enterprise_background``,
            ``informatization_status`` and ``business_flow``.

    Returns:
        The complete prompt text.
    """
    source_structure = inventory_data.data_source_structure

    # Human-readable bullet list for the "存储分布" section of the prompt.
    storage_bullets = "\n".join(
        f"- {item.category}：{item.volume}（{item.storage_type}）"
        for item in inventory_data.storage_distribution
    )

    # The same items as JSON objects for the skeleton. json.dumps with default
    # separators yields the same `{"k": "v", ...}` layout as before, plus
    # proper escaping.
    storage_distribution_json = ",\n      ".join(
        json.dumps(
            {
                "category": format(item.category),
                "volume": format(item.volume),
                "storage_type": format(item.storage_type),
                "color": format(item.color),
            },
            ensure_ascii=False,
        )
        for item in inventory_data.storage_distribution
    )

    total_volume_json = _json_literal(inventory_data.total_data_volume)
    tables_json = _json_literal(f"{inventory_data.total_tables} 张表")
    fields_json = _json_literal(f"{inventory_data.total_fields} 个字段")

    return f"""请基于以下信息生成报告的前两个章节：

## 项目信息
- 项目名称：{project_info.project_name}
- 行业类型：{project_info.industry}
- 企业名称：{project_info.company_name or '未提供'}

## 数据盘点结果
### 数据规模
- 总数据量：{inventory_data.total_data_volume}
- 数据表数量：{inventory_data.total_tables} 张
- 字段数量：{inventory_data.total_fields} 个

### 存储分布
{storage_bullets}

### 数据来源结构
- 结构化数据：{source_structure.structured}%
- 半结构化与非结构化数据：{source_structure.semi_structured}%

## 企业背景信息
{context_data.enterprise_background}

## 信息化建设现状
{context_data.informatization_status}

## 业务流与数据流
{context_data.business_flow}

## 输出要求
1. 生成章节一：企业数字化情况简介
   - 企业背景描述（1-2段，不少于100字）
   - 信息化建设现状（概述、私有云、公有云）
   - 业务流与数据流（概述、制造、物流、零售、数据聚合）

2. 生成章节二：数据资源统计
   - 数据总量统计
   - 存储分布（使用输入数据）
   - 数据来源结构（使用输入数据，确保百分比总和为100%）

请以JSON格式输出，严格按照以下结构：
{{
  "section1": {{
    "enterprise_background": {{"description": "企业背景描述"}},
    "informatization_status": {{
      "overview": "概述",
      "private_cloud": {{"title": "私有云", "description": "描述"}},
      "public_cloud": {{"title": "公有云", "description": "描述"}}
    }},
    "business_data_flow": {{
      "overview": "概述",
      "manufacturing": {{"title": "制造", "description": "描述"}},
      "logistics": {{"title": "物流", "description": "描述"}},
      "retail": {{"title": "零售", "description": "描述"}},
      "data_aggregation": {{"title": "数据聚合", "description": "描述"}}
    }}
  }},
  "section2": {{
    "summary": {{
      "total_data_volume": {total_volume_json},
      "total_data_objects": {{
        "tables": {tables_json},
        "fields": {fields_json}
      }}
    }},
    "storage_distribution": [
      {storage_distribution_json}
    ],
    "data_source_structure": {{
      "structured": {source_structure.structured},
      "semi_structured": {source_structure.semi_structured}
    }}
  }}
}}
"""


def build_section3_prompt(
    inventory_data: InventoryData,
    section1_data: Dict,
    section2_data: Dict
) -> str:
    """Build the user prompt for report section 3 (asset inventory analysis).

    Args:
        inventory_data: Supplies ``identified_assets``; each asset provides
            ``name``, ``description`` and ``core_tables``.
        section1_data: Accepted for call-site symmetry; not used when building
            the prompt.
        section2_data: Accepted for call-site symmetry; not used when building
            the prompt.

    Returns:
        The complete prompt text, including the JSON skeleton the model must
        follow.
    """
    assets = inventory_data.identified_assets

    asset_lines = []
    for asset in assets:
        # An asset without core tables (None or empty) yields an empty list
        # rather than a TypeError from str.join.
        core_tables = ", ".join(asset.core_tables or [])
        asset_lines.append(
            f"- {asset.name}：{asset.description}\n  核心表：{core_tables}"
        )
    assets_info = "\n".join(asset_lines)

    # Render the name list as real JSON: a plain Python list repr would use
    # single quotes, which is invalid inside the JSON skeleton.
    high_value_assets_json = json.dumps(
        [format(asset.name) for asset in assets],
        ensure_ascii=False,
    )

    return f"""基于已识别的数据资产，生成详细的资产盘点分析。

## 识别的数据资产
{assets_info}

## 输出要求
对于每个数据资产，需要：
1. 详细描述资产构成（核心表、字段、数据来源）
2. 说明应用场景和价值
3. 识别合规风险（必须符合PIPL、数据安全法等要求）
4. 提供风险等级评估

合规风险必须识别：
- 个人信息（SPI）风险
- 重要数据风险
- 数据出境风险
- 数据安全风险

请以JSON格式输出：
{{
  "section3": {{
    "overview": {{
      "asset_count": {len(assets)},
      "high_value_assets": {high_value_assets_json},
      "description": "概述描述"
    }},
    "assets": [
      {{
        "id": "asset_id",
        "title": "资产标题",
        "subtitle": "英文名称",
        "composition": {{
          "description": "构成描述",
          "core_tables": ["表1", "表2"]
        }},
        "application_scenarios": {{
          "description": "应用场景描述"
        }},
        "compliance_risks": {{
          "warnings": [
            {{
              "type": "个人信息预警",
              "content": "风险描述",
              "highlights": ["高亮信息"]
            }}
          ]
        }}
      }}
    ]
  }}
}}
"""


def build_section4_prompt(
    section1_data: Dict,
    section2_data: Dict,
    section3_data: Dict,
    value_data: ValueData,
    max_risks: int = 5
) -> str:
    """Build the user prompt for report section 4 (expert recommendations).

    ``section3_data`` comes from parsed LLM output, so it is read defensively:
    a missing or null ``assets``, ``compliance_risks`` or ``warnings`` entry is
    treated as empty, and blank titles or blank risk descriptions are skipped.

    Args:
        section1_data: Accepted for call-site symmetry; not used when building
            the prompt.
        section2_data: Accepted for call-site symmetry; not used when building
            the prompt.
        section3_data: Section 3 content; asset titles are read from
            ``assets[*].title`` and risk texts from
            ``assets[*].compliance_risks.warnings[*].content``.
        value_data: Supplies ``selected_scenarios``; each scenario provides
            ``name`` and ``description``.
        max_risks: Maximum number of risk descriptions listed in the prompt.

    Returns:
        The complete prompt text, including the JSON skeleton the model must
        follow.
    """
    scenarios_info = "\n".join(
        f"- {scenario.name}：{scenario.description}"
        for scenario in value_data.selected_scenarios
    )

    assets = section3_data.get("assets") or []

    # Collect asset titles, dropping blank ones so the summary line has no
    # dangling separators.
    asset_names = []
    for asset in assets:
        title = asset.get("title")
        if title:
            asset_names.append(str(title))

    # Collect risk descriptions; `or` fallbacks cover explicit JSON nulls,
    # which dict.get defaults alone would not.
    risks = []
    for asset in assets:
        warnings = (asset.get("compliance_risks") or {}).get("warnings") or []
        for warning in warnings:
            content = warning.get("content")
            if content:
                risks.append(content)

    assets_summary = ", ".join(asset_names) if asset_names else "无"
    if risks:
        risks_summary = "\n".join(f"- {risk}" for risk in risks[:max_risks])
    else:
        risks_summary = "无重大合规风险"

    return f"""基于前面章节的分析结果，生成专家建议和下一步计划。

## 识别的数据资产
{assets_summary}

## 合规风险汇总
{risks_summary}

## 价值挖掘场景
{scenarios_info or '无'}

## 输出要求
建议需要：
1. 针对识别出的合规风险提供整改方案
2. 提供技术演进建议（架构优化、技术选型）
3. 提供价值深化建议（场景优化、数据应用）

请以JSON格式输出：
{{
  "section4": {{
    "compliance_remediation": {{
      "title": "合规整改",
      "items": [
        {{
          "order": 1,
          "category": "分类",
          "description": "详细建议",
          "code_references": ["表名"]
        }}
      ]
    }},
    "technical_evolution": {{
      "title": "技术演进",
      "description": "技术建议描述",
      "technologies": ["技术1", "技术2"]
    }},
    "value_deepening": {{
      "title": "价值深化",
      "items": [
        {{
          "description": "建议描述",
          "scenarios": ["相关场景"]
        }}
      ]
    }}
  }}
}}
"""


# ==================== 数据验证 ====================

def validate_section2_data(section2_data: Dict, inventory_data: InventoryData) -> None:
    """Check that the section 2 data-source percentages add up to 100.

    Args:
        section2_data: Section 2 content; ``data_source_structure.structured``
            and ``data_source_structure.semi_structured`` are read, each
            defaulting to 0 when absent.
        inventory_data: Accepted for call-site symmetry; not consulted by the
            current checks.

    Raises:
        ValidationException: If either percentage is not a number, or if their
            sum is not 100 (within a small floating-point tolerance).
    """
    # `or {}` also covers an explicit JSON null for the whole structure.
    structure = section2_data.get("data_source_structure") or {}
    structured = structure.get("structured", 0)
    semi_structured = structure.get("semi_structured", 0)

    for value in (structured, semi_structured):
        # bool is an int subclass; True/False are not meaningful percentages.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValidationException(
                f"数据来源结构百分比必须为数值，当前为 "
                f"structured={structured!r}, semi_structured={semi_structured!r}"
            )

    total = structured + semi_structured

    # Compare with a tolerance so float rounding does not cause a spurious
    # failure. The negated `<=` form also rejects NaN, which would slip
    # through a plain `>` comparison.
    if not abs(total - 100) <= 1e-6:
        raise ValidationException(
            f"数据来源结构百分比总和必须为100%，当前为 {total}%"
        )


def validate_section3_data(section3_data: Dict) -> None:
    """Check that section 3 contains at least one asset.

    Assets without any compliance warning are not rejected; a warning is
    logged for each so the gap is visible. Because the data comes from parsed
    LLM output, a null ``compliance_risks`` or ``warnings`` value is treated
    the same as a missing one.

    Args:
        section3_data: Section 3 content; ``assets`` and each asset's
            ``compliance_risks.warnings`` are read.

    Raises:
        ValidationException: If ``assets`` is missing, null or empty.
    """
    assets = section3_data.get("assets", [])

    if not assets:
        raise ValidationException("必须至少包含一个数据资产")

    for idx, asset in enumerate(assets):
        # `or` fallbacks cover explicit JSON nulls, which dict.get defaults
        # alone would not, and which would otherwise raise AttributeError.
        warnings = (asset.get("compliance_risks") or {}).get("warnings") or []
        if not warnings:
            # Fall back to the 1-based position when the title is missing or blank.
            label = asset.get("title") or idx + 1
            logger.warning(f"资产 {label} 缺少合规风险分析")


# ==================== 主要服务类 ====================

class ReportGenerationService:
    """Generate the data-asset inventory report through three sequential LLM calls."""

    @staticmethod
    async def _request_json(prompt: str, model: Any, temperature: Any) -> Dict[str, Any]:
        """Send one prompt with the shared system prompt and return the parsed JSON reply."""
        raw_response = await llm_client.call(
            prompt=prompt,
            system_prompt=SYSTEM_PROMPT,
            temperature=temperature,
            model=model
        )
        return llm_client.parse_json_response(raw_response)

    @staticmethod
    async def generate(request: GenerateReportRequest) -> Dict[str, Any]:
        """Generate the full data-asset inventory report.

        The report is produced in three stages, each one LLM call: sections 1
        and 2 together, then section 3, then section 4 (which is built from
        the section 3 result). Sections 2 and 3 are validated before the next
        stage starts.

        Args:
            request: Report generation request providing ``project_info``,
                ``inventory_data``, ``context_data`` and ``value_data``.

        Returns:
            A dict with the keys ``header``, ``section1`` to ``section4``,
            ``generation_time`` (seconds, rounded to 2 decimals) and
            ``model_used``.

        Raises:
            LLMAPIException: On any failure, including validation failures.
                The original exception is attached as the cause. The error is
                flagged retryable when its message mentions a rate limit or a
                timeout (case-insensitive).
        """
        # Monotonic clock: the elapsed time cannot be skewed by wall-clock adjustments.
        started_at = time.perf_counter()
        asset_count = len(request.inventory_data.identified_assets)

        logger.info(
            f"开始生成报告 - 项目: {request.project_info.project_name}, "
            f"资产数: {asset_count}"
        )

        try:
            # Model configuration shared by all three calls.
            model = settings.DEFAULT_LLM_MODEL
            temperature = settings.DEFAULT_TEMPERATURE

            # Stage 1: sections 1 and 2.
            logger.info("生成章节一和章节二...")
            prompt_1_2 = build_section1_2_prompt(
                request.project_info,
                request.inventory_data,
                request.context_data
            )
            result_1_2 = await ReportGenerationService._request_json(
                prompt_1_2, model, temperature
            )
            section1 = result_1_2.get("section1", {})
            section2 = result_1_2.get("section2", {})

            validate_section2_data(section2, request.inventory_data)
            logger.info("章节一和章节二生成成功")

            # Stage 2: section 3.
            logger.info("生成章节三...")
            prompt_3 = build_section3_prompt(
                request.inventory_data,
                section1,
                section2
            )
            result_3 = await ReportGenerationService._request_json(
                prompt_3, model, temperature
            )
            section3 = result_3.get("section3", {})

            validate_section3_data(section3)
            logger.info("章节三生成成功")

            # Stage 3: section 4, built from the earlier sections.
            logger.info("生成章节四...")
            prompt_4 = build_section4_prompt(
                section1,
                section2,
                section3,
                request.value_data
            )
            result_4 = await ReportGenerationService._request_json(
                prompt_4, model, temperature
            )
            logger.info("章节四生成成功")

            # Assemble the full response.
            generation_time = time.perf_counter() - started_at

            response_data = {
                "header": {
                    "project_name": request.project_info.project_name
                },
                "section1": section1,
                "section2": section2,
                "section3": section3,
                "section4": result_4.get("section4", {}),
                "generation_time": round(generation_time, 2),
                "model_used": model
            }

            logger.info(
                f"报告生成完成 - 耗时: {generation_time:.2f}秒, "
                f"资产数: {asset_count}"
            )

            return response_data

        except Exception as e:
            message = str(e)
            logger.exception(f"报告生成失败: {message}")
            # Match both markers case-insensitively so "rate limit" and
            # "Rate Limit" are treated the same way as "Timeout"/"timeout".
            lowered = message.lower()
            raise LLMAPIException(
                f"报告生成失败: {message}",
                error_detail=message,
                retryable="rate limit" in lowered or "timeout" in lowered
            ) from e