192 lines
7.9 KiB
Python
192 lines
7.9 KiB
Python
"""v2 analysis route: analyze CSV from OSS/URL.
|
||
|
||
Design goals:
|
||
- Keep v1 endpoints unchanged
|
||
- Provide the same response shape as v1, but with URL as input
|
||
- Avoid leaking server local paths
|
||
"""
|
||
|
||
import gc
import logging
import os
import shutil
from datetime import datetime
from typing import Any, Dict, Optional
from urllib.parse import urlsplit

import psutil
from fastapi import APIRouter, BackgroundTasks, HTTPException, status
from pydantic import BaseModel, Field

from app.core.config import settings
from app.services.analysis import TimeSeriesAnalysisSystem
from app.services.oss_csv_source import UrlValidationError, download_csv_to_tempfile
|
||
|
||
# Module-level logger, named after this module so log output is filterable.
logger = logging.getLogger(__name__)

# Router registered by the application under the v2 API prefix.
router = APIRouter()
|
||
|
||
|
||
class AnalysisV2Request(BaseModel):
    """v2 analysis request: the input is an OSS/URL instead of an uploaded file."""

    # Publicly reachable (or presigned) URL of the CSV file to analyze.
    oss_url: str
    # Free-text description of the analysis task, forwarded to the analyzer.
    task_description: str = "时间序列数据分析"
    # Optional domain/background information about the data. default_factory
    # gives each instance its own dict rather than a shared class-level literal.
    data_background: Dict[str, Any] = Field(default_factory=dict)
    # Report language; "zh" and "en" are supported, anything else falls back to "zh".
    language: str = "zh"
    # Requested plot generation; the v2 endpoint always forces this off server-side.
    generate_plots: bool = False
    # Optional display name overriding the filename derived from the URL.
    source_name: Optional[str] = None
|
||
|
||
|
||
@router.get("/available_methods", summary="获取可用的分析方法(v2)")
|
||
async def get_available_methods_v2() -> dict:
|
||
"""v2 版本:返回与 v1 相同的可用分析方法列表。"""
|
||
|
||
return {
|
||
"success": True,
|
||
"methods": {
|
||
"statistical_overview": {"name": "统计概览", "description": "生成数据的基本统计信息和分布图表"},
|
||
"time_series_analysis": {"name": "时间序列分析", "description": "分析变量随时间变化的趋势和模式"},
|
||
"acf_pacf_analysis": {"name": "自相关分析", "description": "生成自相关和偏自相关函数图"},
|
||
"stationarity_tests": {"name": "平稳性检验", "description": "执行ADF、KPSS等平稳性检验"},
|
||
"normality_tests": {"name": "正态性检验", "description": "执行Shapiro-Wilk、Jarque-Bera正态性检验"},
|
||
"seasonal_decomposition": {"name": "季节性分解", "description": "分解时间序列的趋势、季节和残差成分"},
|
||
"spectral_analysis": {"name": "频谱分析", "description": "分析时间序列的频域特征"},
|
||
"correlation_analysis": {"name": "相关性分析", "description": "计算变量间的相关性并生成热力图"},
|
||
"pca_scree_plot": {"name": "PCA碎石图", "description": "显示主成分分析的解释方差"},
|
||
"pca_analysis": {"name": "主成分分析", "description": "降维分析,识别数据的主要变化方向"},
|
||
"feature_importance": {"name": "特征重要性", "description": "分析各变量对目标预测的重要性"},
|
||
"clustering_analysis": {"name": "聚类分析", "description": "将数据点分组为具有相似特征的簇"},
|
||
"factor_analysis": {"name": "因子分析", "description": "识别潜在的因子结构"},
|
||
"cointegration_test": {"name": "协整检验", "description": "检验时间序列变量间的长期均衡关系"},
|
||
"var_analysis": {"name": "向量自回归", "description": "多变量时间序列建模和预测"},
|
||
},
|
||
}
|
||
|
||
|
||
def check_memory():
    """Log the current process RSS and trigger a GC pass above the configured cap."""
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
    logger.info(f"当前内存使用: {rss_mb:.2f} MB")

    # Above the configured ceiling, force a collection to release what we can.
    if rss_mb > settings.MAX_MEMORY_MB:
        logger.warning(f"内存使用超过阈值 ({settings.MAX_MEMORY_MB} MB),执行垃圾回收")
        gc.collect()
|
||
|
||
|
||
@router.post("/analyze", summary="执行完整分析(v2:从 OSS URL 读取 CSV)")
|
||
async def analyze_data_v2(request: AnalysisV2Request, background_tasks: BackgroundTasks) -> dict:
|
||
"""Analyze CSV from an OSS/URL, returning the same structure as v1."""
|
||
|
||
downloaded = None
|
||
|
||
try:
|
||
logger.info("=" * 60)
|
||
logger.info("开始分析 (v2)")
|
||
logger.info(f"URL host: {request.oss_url}")
|
||
logger.info(f"任务: {request.task_description}")
|
||
logger.info(f"语言: {request.language}")
|
||
logger.info("=" * 60)
|
||
|
||
check_memory()
|
||
|
||
# 语言处理:支持 zh/en,其他值回退为 zh
|
||
lang_key = request.language if request.language in {"zh", "en"} else "zh"
|
||
|
||
# charts 模式下强制不生成图片,即使请求传了 generate_plots=true
|
||
generate_plots = False
|
||
if request.generate_plots:
|
||
logger.info("generate_plots requested true, forcing false to skip image generation")
|
||
|
||
# 下载到临时文件
|
||
try:
|
||
downloaded = download_csv_to_tempfile(request.oss_url, suffix=".csv")
|
||
except UrlValidationError as e:
|
||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
|
||
|
||
filename_for_meta = request.source_name or downloaded.source_name
|
||
|
||
# 创建分析器实例(复用原有分析系统)
|
||
analyzer = TimeSeriesAnalysisSystem(
|
||
downloaded.local_path,
|
||
request.task_description,
|
||
data_background=request.data_background,
|
||
language=lang_key,
|
||
generate_plots=generate_plots,
|
||
)
|
||
|
||
# 运行分析
|
||
logger.info("执行分析...")
|
||
results, log_entries = analyzer.run_analysis()
|
||
|
||
if results is None:
|
||
logger.error("分析失败")
|
||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="分析失败")
|
||
|
||
# 准备返回数据(尽量与 v1 保持一致)
|
||
response_data = {
|
||
"success": True,
|
||
"meta": {
|
||
"filename": filename_for_meta,
|
||
"task_description": request.task_description,
|
||
"language": lang_key,
|
||
"generate_plots": generate_plots,
|
||
"created_at": datetime.now().isoformat(),
|
||
"version": "v2",
|
||
"source": {
|
||
"type": "oss_url",
|
||
"host": downloaded.source_host,
|
||
"name": filename_for_meta,
|
||
"etag": downloaded.etag,
|
||
"last_modified": downloaded.last_modified,
|
||
},
|
||
},
|
||
"analysis": {
|
||
lang_key: {
|
||
"pdf_filename": None,
|
||
"ppt_filename": None,
|
||
"data_description": results.get("data_description"),
|
||
"preprocessing_steps": results.get("preprocessing_steps", []),
|
||
"api_analysis": results.get("api_analysis", {}),
|
||
"steps": results.get("steps", []),
|
||
"charts": results.get("charts", {}),
|
||
}
|
||
},
|
||
"images": {},
|
||
"log": log_entries[-20:] if log_entries else [],
|
||
"original_image": None,
|
||
}
|
||
|
||
# 兼容旧前端:始终提供 analysis.zh
|
||
if lang_key != "zh":
|
||
response_data["analysis"]["zh"] = response_data["analysis"][lang_key]
|
||
|
||
analysis_bucket = response_data["analysis"][lang_key]
|
||
|
||
# 确保不暴露本地路径,steps chart 引用即可
|
||
steps = analysis_bucket.get("steps")
|
||
if isinstance(steps, list):
|
||
for step in steps:
|
||
if isinstance(step, dict) and "image_path" in step:
|
||
step.pop("image_path", None)
|
||
|
||
# images 保留为空兼容旧前端
|
||
response_data["images"] = {}
|
||
|
||
logger.info("分析完成 (v2)")
|
||
return response_data
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"分析异常 (v2): {str(e)}", exc_info=True)
|
||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
|
||
|
||
finally:
|
||
# 清理临时文件
|
||
if downloaded is not None:
|
||
try:
|
||
os.unlink(downloaded.local_path)
|
||
except Exception:
|
||
pass
|