356 lines
12 KiB
Python
356 lines
12 KiB
Python
"""
|
|
Arcádia BI Analysis Service - Análise de dados com Pandas
|
|
Serviço FastAPI que processa dados com pandas para fornecer estatísticas avançadas
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
from typing import Optional, List, Dict, Any
|
|
from datetime import datetime
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel, Field
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Service application instance together with its descriptive metadata.
app = FastAPI(
    title="Arcádia BI Analysis Service",
    description="Análise de dados com Pandas para o módulo de BI",
    version="1.0.0",
)

# CORS: every origin, method and header is accepted, credentials included.
# NOTE(review): wildcard origins together with credentials is very permissive;
# confirm this is intended for non-development deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True,
    allow_methods=["*"], allow_headers=["*"],
)
|
|
|
|
|
|
class AnalysisRequest(BaseModel):
    # Request payload accepted by the analysis endpoint.

    # Required: the rows to analyse, one dictionary per record.
    data: List[Dict[str, Any]] = Field(
        default=...,
        description="Dados para análise em formato JSON",
    )
    # Optional free-text question about the data; None when not supplied.
    question: Optional[str] = Field(
        default=None,
        description="Pergunta específica sobre os dados",
    )
|
|
|
|
|
|
class ColumnStats(BaseModel):
    # Per-column statistics produced by analyze_column().

    # --- always populated -------------------------------------------------
    name: str            # column label
    dtype: str           # string form of the pandas dtype
    count: int           # number of non-null values
    null_count: int      # number of null values
    unique_count: int    # number of distinct non-null values

    # --- populated depending on the column type; None when not set --------
    min_value: Optional[Any] = None
    max_value: Optional[Any] = None
    mean: Optional[float] = None
    median: Optional[float] = None
    std: Optional[float] = None
    sum: Optional[float] = None
    # Most frequent values as {"value": ..., "count": ...} entries.
    top_values: Optional[List[Dict[str, Any]]] = None
|
|
|
|
|
|
class AnalysisResult(BaseModel):
    # Response payload returned by the analysis endpoint.

    # Shape of the analysed dataset.
    row_count: int
    column_count: int

    # One ColumnStats entry per column.
    columns: List[ColumnStats]

    # Aggregated summaries for numeric and categorical columns.
    numeric_summary: Dict[str, Any]
    categorical_summary: Dict[str, Any]

    # Column -> column -> correlation coefficient; None when not computed.
    correlations: Optional[Dict[str, Dict[str, float]]] = None

    # Generated text insights and chart suggestions.
    insights: List[str]
    suggested_charts: List[Dict[str, Any]]
|
|
|
|
|
|
def analyze_column(df: pd.DataFrame, col: str) -> ColumnStats:
    """Compute descriptive statistics for a single DataFrame column.

    Args:
        df: DataFrame that contains the column.
        col: Label of the column to analyse.

    Returns:
        A ``ColumnStats`` whose name, dtype, non-null count, null count and
        distinct count are always set. Numeric columns additionally get
        min/max/mean/median/std/sum (``None`` wherever pandas yields a
        missing value). Other columns get their five most frequent values
        and, for ``object`` dtype, the min/max of the non-null values as
        strings when those values can be ordered.
    """
    series = df[col]

    stats = ColumnStats(
        name=col,
        dtype=str(series.dtype),
        count=int(series.count()),
        null_count=int(series.isna().sum()),
        unique_count=int(series.nunique()),
    )

    def _float_or_none(value: Any) -> Optional[float]:
        # Missing reductions (e.g. std of a single value) become None.
        return None if pd.isna(value) else float(value)

    if pd.api.types.is_numeric_dtype(series):
        # Each reduction is evaluated exactly once.
        stats.min_value = _float_or_none(series.min())
        stats.max_value = _float_or_none(series.max())
        stats.mean = _float_or_none(series.mean())
        stats.median = _float_or_none(series.median())
        stats.std = _float_or_none(series.std())
        stats.sum = _float_or_none(series.sum())
        return stats

    most_common = series.value_counts().head(5)
    stats.top_values = [
        {"value": str(value), "count": int(freq)}
        for value, freq in most_common.items()
    ]

    if series.dtype == 'object':
        present = series.dropna()
        # An all-null column has no min/max; without this guard the
        # reduction yields NaN and would be reported as the string "nan".
        if not present.empty:
            try:
                lowest, highest = present.min(), present.max()
            except (TypeError, ValueError):
                # Values that cannot be ordered against each other (for
                # example str mixed with int): leave min/max unset.
                pass
            else:
                stats.min_value = str(lowest)
                stats.max_value = str(highest)

    return stats
|
|
|
|
|
|
def generate_insights(df: pd.DataFrame) -> List[str]:
    """Build a list of automatic, human-readable insights about ``df``.

    The messages (in Portuguese, as emitted to clients) cover, in order:
    dataset size, overall missing-data rate, the numeric column names with
    high-variability and IQR-outlier notes for the first three of them, the
    categorical column names with an identifier-like note for the first two
    of them, and strong pairwise correlations (|r| > 0.7) between numeric
    columns.

    Args:
        df: Data to describe. May be empty.

    Returns:
        The insight messages; always contains at least the size message.
    """
    row_count = len(df)
    insights = [f"O dataset possui {row_count} registros e {len(df.columns)} colunas."]

    # Guard the ratio: a frame with no rows or no columns has zero cells.
    total_cells = row_count * len(df.columns)
    if total_cells:
        null_pct = (df.isna().sum().sum() / total_cells) * 100
        if null_pct > 0:
            insights.append(f"Taxa de dados faltantes: {null_pct:.1f}%")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        insights.append(f"Colunas numéricas: {', '.join(numeric_cols)}")

        for col in numeric_cols[:3]:
            series = df[col].dropna()
            if series.empty:
                continue

            # Coefficient of variation uses |mean| so that columns with a
            # negative mean are not silently excluded by a negative ratio.
            mean = series.mean()
            cv = (series.std() / abs(mean) * 100) if mean != 0 else 0
            if cv > 50:
                insights.append(f"'{col}' tem alta variabilidade (CV: {cv:.1f}%)")

            # Tukey fences: values beyond 1.5 * IQR from the quartiles.
            q1, q3 = series.quantile([0.25, 0.75])
            iqr = q3 - q1
            outliers = int(((series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)).sum())
            if outliers > 0:
                insights.append(f"'{col}' possui {outliers} outliers potenciais")

    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if cat_cols:
        insights.append(f"Colunas categóricas: {', '.join(cat_cols)}")

        # Uniqueness ratio is undefined without rows.
        if row_count:
            for col in cat_cols[:2]:
                unique_pct = df[col].nunique() / row_count * 100
                if unique_pct > 90:
                    insights.append(
                        f"'{col}' parece ser um identificador único ({unique_pct:.0f}% valores únicos)"
                    )

    if len(numeric_cols) >= 2:
        corr_matrix = df[numeric_cols].corr()
        for i, col1 in enumerate(numeric_cols):
            for col2 in numeric_cols[i + 1:]:
                corr = corr_matrix.loc[col1, col2]
                # NaN correlations (constant columns) fail this comparison
                # and are therefore skipped.
                if abs(corr) > 0.7:
                    direction = "positiva" if corr > 0 else "negativa"
                    insights.append(
                        f"Correlação {direction} forte entre '{col1}' e '{col2}' ({corr:.2f})"
                    )

    return insights
|
|
|
|
|
|
def suggest_charts(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Propose chart configurations that fit the column types of ``df``.

    Suggestions are emitted in a fixed order: bar, line, pie, scatter. Each
    one is a dict with the keys ``type``, ``title``, ``xAxis``, ``yAxis`` and
    ``aggregation``.

    Args:
        df: Data whose columns drive the suggestions.

    Returns:
        The list of suggestions; empty when no rule applies.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # Date-like columns are detected purely by name ("date" or "data").
    date_cols = [
        name for name in df.columns
        if any(marker in name.lower() for marker in ('date', 'data'))
    ]

    def _chart(kind, title, x_axis, y_axis, aggregation="sum"):
        # Single place that fixes the key names and their order.
        return {
            "type": kind,
            "title": title,
            "xAxis": x_axis,
            "yAxis": y_axis,
            "aggregation": aggregation,
        }

    suggestions = []
    has_cat_and_numeric = bool(cat_cols) and bool(numeric_cols)

    # Bar: first numeric column grouped by the first categorical column.
    if has_cat_and_numeric:
        suggestions.append(_chart(
            "bar",
            f"Distribuição de {numeric_cols[0]} por {cat_cols[0]}",
            cat_cols[0],
            numeric_cols[0],
        ))

    # Line: first numeric column over the first date-like column.
    if date_cols and numeric_cols:
        suggestions.append(_chart(
            "line",
            f"Evolução de {numeric_cols[0]} ao longo do tempo",
            date_cols[0],
            numeric_cols[0],
        ))

    # Pie: only when the first categorical column has at most 8 categories.
    if has_cat_and_numeric:
        cat_col = cat_cols[0]
        if df[cat_col].nunique() <= 8:
            suggestions.append(_chart(
                "pie",
                f"Proporção de {numeric_cols[0]} por {cat_col}",
                cat_col,
                numeric_cols[0],
            ))

    # Scatter: the first two numeric columns against each other.
    if len(numeric_cols) >= 2:
        first, second = numeric_cols[0], numeric_cols[1]
        suggestions.append(_chart(
            "scatter",
            f"Correlação entre {first} e {second}",
            first,
            second,
            aggregation="none",
        ))

    return suggestions
|
|
|
|
|
|
@app.get("/health")
|
|
async def health_check():
|
|
return {"status": "ok", "service": "bi-analysis", "pandas_version": pd.__version__}
|
|
|
|
|
|
@app.post("/analyze", response_model=AnalysisResult)
|
|
async def analyze_data(request: AnalysisRequest):
|
|
"""Analisa um conjunto de dados com pandas"""
|
|
try:
|
|
if not request.data or len(request.data) == 0:
|
|
raise HTTPException(status_code=400, detail="Dados vazios")
|
|
|
|
df = pd.DataFrame(request.data)
|
|
|
|
columns_stats = [analyze_column(df, col) for col in df.columns]
|
|
|
|
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
numeric_summary = {}
|
|
if numeric_cols:
|
|
desc = df[numeric_cols].describe()
|
|
numeric_summary = {
|
|
"columns": numeric_cols,
|
|
"statistics": desc.to_dict()
|
|
}
|
|
|
|
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
|
categorical_summary = {}
|
|
if cat_cols:
|
|
cat_stats = {}
|
|
for col in cat_cols[:5]:
|
|
value_counts = df[col].value_counts().head(10)
|
|
cat_stats[col] = {
|
|
"unique_count": int(df[col].nunique()),
|
|
"top_values": [{"value": str(k), "count": int(v)} for k, v in value_counts.items()]
|
|
}
|
|
categorical_summary = cat_stats
|
|
|
|
correlations = None
|
|
if len(numeric_cols) >= 2:
|
|
corr_matrix = df[numeric_cols].corr()
|
|
correlations = {}
|
|
for col in numeric_cols:
|
|
correlations[col] = {k: round(float(v), 3) for k, v in corr_matrix[col].items()}
|
|
|
|
insights = generate_insights(df)
|
|
suggested_charts = suggest_charts(df)
|
|
|
|
return AnalysisResult(
|
|
row_count=len(df),
|
|
column_count=len(df.columns),
|
|
columns=columns_stats,
|
|
numeric_summary=numeric_summary,
|
|
categorical_summary=categorical_summary,
|
|
correlations=correlations,
|
|
insights=insights,
|
|
suggested_charts=suggested_charts
|
|
)
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Erro na análise: {str(e)}")
|
|
|
|
|
|
@app.post("/aggregate")
|
|
async def aggregate_data(
|
|
data: List[Dict[str, Any]],
|
|
group_by: str,
|
|
agg_column: str,
|
|
agg_function: str = "sum"
|
|
):
|
|
"""Agrega dados por uma coluna"""
|
|
try:
|
|
df = pd.DataFrame(data)
|
|
|
|
if group_by not in df.columns:
|
|
raise HTTPException(status_code=400, detail=f"Coluna '{group_by}' não encontrada")
|
|
if agg_column not in df.columns:
|
|
raise HTTPException(status_code=400, detail=f"Coluna '{agg_column}' não encontrada")
|
|
|
|
agg_funcs = {
|
|
"sum": "sum",
|
|
"mean": "mean",
|
|
"count": "count",
|
|
"min": "min",
|
|
"max": "max",
|
|
"median": "median",
|
|
"std": "std"
|
|
}
|
|
|
|
if agg_function not in agg_funcs:
|
|
raise HTTPException(status_code=400, detail=f"Função de agregação inválida. Use: {list(agg_funcs.keys())}")
|
|
|
|
result = df.groupby(group_by)[agg_column].agg(agg_funcs[agg_function]).reset_index()
|
|
result.columns = [group_by, agg_column]
|
|
|
|
return {
|
|
"data": result.to_dict(orient="records"),
|
|
"aggregation": {
|
|
"group_by": group_by,
|
|
"column": agg_column,
|
|
"function": agg_function
|
|
}
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Erro na agregação: {str(e)}")
|
|
|
|
|
|
@app.post("/filter")
|
|
async def filter_data(
|
|
data: List[Dict[str, Any]],
|
|
column: str,
|
|
operator: str,
|
|
value: Any
|
|
):
|
|
"""Filtra dados por uma condição"""
|
|
try:
|
|
df = pd.DataFrame(data)
|
|
|
|
if column not in df.columns:
|
|
raise HTTPException(status_code=400, detail=f"Coluna '{column}' não encontrada")
|
|
|
|
ops = {
|
|
"eq": lambda x, v: x == v,
|
|
"ne": lambda x, v: x != v,
|
|
"gt": lambda x, v: x > v,
|
|
"gte": lambda x, v: x >= v,
|
|
"lt": lambda x, v: x < v,
|
|
"lte": lambda x, v: x <= v,
|
|
"contains": lambda x, v: x.str.contains(str(v), case=False, na=False),
|
|
"startswith": lambda x, v: x.str.startswith(str(v), na=False),
|
|
"endswith": lambda x, v: x.str.endswith(str(v), na=False)
|
|
}
|
|
|
|
if operator not in ops:
|
|
raise HTTPException(status_code=400, detail=f"Operador inválido. Use: {list(ops.keys())}")
|
|
|
|
mask = ops[operator](df[column], value)
|
|
result = df[mask]
|
|
|
|
return {
|
|
"data": result.to_dict(orient="records"),
|
|
"filter": {
|
|
"column": column,
|
|
"operator": operator,
|
|
"value": value
|
|
},
|
|
"original_count": len(df),
|
|
"filtered_count": len(result)
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Erro no filtro: {str(e)}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
port = int(os.environ.get("BI_ANALYSIS_PORT", 8003))
|
|
uvicorn.run(app, host="0.0.0.0", port=port)
|