arcadia-suite-sv/python-service/services/capture.py

177 lines
6.5 KiB
Python

import base64
import os
import json
from datetime import datetime
from typing import Dict, Any
import httpx
from bs4 import BeautifulSoup
async def capture_url(url: str, wait_time: int = 3000) -> Dict[str, Any]:
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7"
}
async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
response = await client.get(url, headers=headers)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
for tag in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside']):
tag.decompose()
title = soup.title.string.strip() if soup.title and soup.title.string else url
text_parts = []
for text in soup.stripped_strings:
if len(text) > 2:
text_parts.append(text)
text_content = '\n'.join(text_parts)
headings = []
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
for h in soup.find_all(tag):
if h.get_text(strip=True):
headings.append({
"level": tag.upper(),
"text": h.get_text(strip=True)[:200]
})
links = []
for a in soup.find_all('a', href=True):
text = a.get_text(strip=True)
if text and len(text) > 2:
href = a['href']
if href.startswith('/'):
from urllib.parse import urljoin
href = urljoin(url, href)
links.append({
"text": text[:100],
"href": href
})
if len(links) >= 50:
break
images = []
for img in soup.find_all('img', src=True):
src = img['src']
if src.startswith('/'):
from urllib.parse import urljoin
src = urljoin(url, src)
images.append({
"src": src,
"alt": img.get('alt', '')[:100]
})
if len(images) >= 20:
break
meta = {}
for m in soup.find_all('meta'):
name = m.get('name') or m.get('property')
content = m.get('content')
if name and content:
meta[name] = content[:500]
return {
"success": True,
"url": url,
"title": title,
"text_content": text_content[:50000],
"headings": headings[:30],
"links": links,
"images": images,
"meta": meta,
"screenshot_base64": None,
"captured_at": datetime.now().isoformat(),
"word_count": len(text_content.split())
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"url": url,
"error": f"HTTP error: {e.response.status_code}",
"captured_at": datetime.now().isoformat()
}
except httpx.RequestError as e:
return {
"success": False,
"url": url,
"error": f"Request failed: {str(e)}",
"captured_at": datetime.now().isoformat()
}
except Exception as e:
return {
"success": False,
"url": url,
"error": str(e),
"captured_at": datetime.now().isoformat()
}
async def extract_requirements(content: str, context: str = "") -> Dict[str, Any]:
openai_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("AI_INTEGRATIONS_OPENAI_API_KEY")
if not openai_key:
return {"success": False, "error": "OpenAI API key not configured"}
prompt = f"""Analise o seguinte conteúdo de uma página web e extraia informações estruturadas:
CONTEÚDO:
{content[:15000]}
CONTEXTO ADICIONAL:
{context}
Por favor, extraia:
1. RESUMO: Um resumo conciso do conteúdo (2-3 parágrafos)
2. FUNCIONALIDADES: Liste as principais funcionalidades ou recursos mencionados
3. REQUISITOS: Se houver menção a requisitos de sistema, liste-os
4. ENTIDADES: Identifique entidades principais (pessoas, empresas, produtos, tecnologias)
5. TERMOS_CHAVE: Liste os 10 termos ou conceitos mais importantes
6. INSIGHTS: Observações ou insights relevantes para análise de negócios
Responda em formato JSON válido."""
try:
async with httpx.AsyncClient() as client:
response = await client.post(
"https://api.openai.com/v1/chat/completions",
headers={
"Authorization": f"Bearer {openai_key}",
"Content-Type": "application/json"
},
json={
"model": "gpt-4o-mini",
"messages": [
{"role": "system", "content": "Você é um analista de sistemas especializado em extrair requisitos e informações estruturadas de conteúdo web."},
{"role": "user", "content": prompt}
],
"temperature": 0.3,
"max_tokens": 2000
},
timeout=60.0
)
if response.status_code == 200:
data = response.json()
content_response = data["choices"][0]["message"]["content"]
try:
if "```json" in content_response:
content_response = content_response.split("```json")[1].split("```")[0]
elif "```" in content_response:
content_response = content_response.split("```")[1].split("```")[0]
extracted = json.loads(content_response.strip())
return {"success": True, "extraction": extracted}
except json.JSONDecodeError:
return {"success": True, "extraction": {"raw_response": content_response}}
else:
return {"success": False, "error": f"OpenAI API error: {response.status_code}"}
except Exception as e:
return {"success": False, "error": str(e)}