From e10f623eb9ee2cfabafadc93573178fe39273fda Mon Sep 17 00:00:00 2001
From: JonasMelo21 <jonashonorato4@gmail.com>
Date: Mon, 6 Apr 2026 21:19:34 -0300
Subject: [PATCH 01/10] =?UTF-8?q?feat:=20adiciona=20avalia=C3=A7=C3=A3o=20?=
 =?UTF-8?q?com=20Spider=20Dataset?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Novo módulo src/spider/ com data_loader, query_executor, metrics, csv_reporter
- Script scripts/test_spider_eval.py para avaliar agente contra 1.034 exemplos
- Rastreia cada tentativa individualmente em CSV (1 linha por tentativa)
- Calcula similarity score, resultado match, feedback do crítico
- Suporta filtrar por db_id, seed, sample-size, max-attempts
- Gera resumo com estatísticas por pergunta
- Atualiza documentação: DESENVOLVIMENTO.md e README.md
---
 DESENVOLVIMENTO.md             | 138 +++++++++++++++-
 README.md                      |  89 +++++++++-
 projeto-raia/bin/Activate.ps1  | 247 ++++++++++++++++++++++++++++
 projeto-raia/bin/activate      |  70 ++++++++
 projeto-raia/bin/activate.csh  |  27 +++
 projeto-raia/bin/activate.fish |  69 ++++++++
 projeto-raia/bin/distro        |   8 +
 projeto-raia/bin/dotenv        |   8 +
 projeto-raia/bin/filetype      |   8 +
 projeto-raia/bin/httpx         |   8 +
 projeto-raia/bin/jsondiff      |  41 +++++
 projeto-raia/bin/jsonpatch     | 107 ++++++++++++
 projeto-raia/bin/jsonpointer   |  66 ++++++++
 projeto-raia/bin/normalizer    |   8 +
 projeto-raia/bin/pip           |   8 +
 projeto-raia/bin/pip3          |   8 +
 projeto-raia/bin/pip3.12       |   8 +
 projeto-raia/bin/py.test       |   8 +
 projeto-raia/bin/pygmentize    |   8 +
 projeto-raia/bin/pytest        |   8 +
 projeto-raia/bin/python        |   1 +
 projeto-raia/bin/python3       |   1 +
 projeto-raia/bin/python3.12    |   1 +
 projeto-raia/bin/websockets    |   8 +
 projeto-raia/lib64             |   1 +
 projeto-raia/pyvenv.cfg        |   5 +
 reports/meu_teste.csv          |   4 +
 scripts/test_spider_eval.py    | 291 +++++++++++++++++++++++++++++++++
 src/graph.py                   |  15 ++
 src/spider/__init__.py         |   9 +
 src/spider/csv_reporter.py     | 149 +++++++++++++++++
 src/spider/data_loader.py      |  96 +++++++++++
 src/spider/metrics.py          | 161 ++++++++++++++++++
 src/spider/query_executor.py   | 125 ++++++++++++++
 34 files changed, 1806 insertions(+), 3 deletions(-)
 create mode 100644 projeto-raia/bin/Activate.ps1
 create mode 100644 projeto-raia/bin/activate
 create mode 100644 projeto-raia/bin/activate.csh
 create mode 100644 projeto-raia/bin/activate.fish
 create mode 100755 projeto-raia/bin/distro
 create mode 100755 projeto-raia/bin/dotenv
 create mode 100755 projeto-raia/bin/filetype
 create mode 100755 projeto-raia/bin/httpx
 create mode 100755 projeto-raia/bin/jsondiff
 create mode 100755 projeto-raia/bin/jsonpatch
 create mode 100755 projeto-raia/bin/jsonpointer
 create mode 100755 projeto-raia/bin/normalizer
 create mode 100755 projeto-raia/bin/pip
 create mode 100755 projeto-raia/bin/pip3
 create mode 100755 projeto-raia/bin/pip3.12
 create mode 100755 projeto-raia/bin/py.test
 create mode 100755 projeto-raia/bin/pygmentize
 create mode 100755 projeto-raia/bin/pytest
 create mode 120000 projeto-raia/bin/python
 create mode 120000 projeto-raia/bin/python3
 create mode 120000 projeto-raia/bin/python3.12
 create mode 100755 projeto-raia/bin/websockets
 create mode 120000 projeto-raia/lib64
 create mode 100644 projeto-raia/pyvenv.cfg
 create mode 100644 reports/meu_teste.csv
 create mode 100644 scripts/test_spider_eval.py
 create mode 100644 src/spider/__init__.py
 create mode 100644 src/spider/csv_reporter.py
 create mode 100644 src/spider/data_loader.py
 create mode 100644 src/spider/metrics.py
 create mode 100644 src/spider/query_executor.py

diff --git a/DESENVOLVIMENTO.md b/DESENVOLVIMENTO.md
index 797653f..17e5700 100644
--- a/DESENVOLVIMENTO.md
+++ b/DESENVOLVIMENTO.md
@@ -75,6 +75,129 @@ Se um teste falha:
 - Falha na camada 2 → o nó específico que falhou está com problema
 - Camada 2 passa mas camada 3 falha → problema nos roteadores ou na conexão entre nós
 
+## Testes do módulo Spider
+
+```bash
+# Testar data loader
+python -c "from src.spider.data_loader import load_spider_dev_examples; ex = load_spider_dev_examples('data/spider_data/spider_data'); print(f'Carregado: {len(ex)} exemplos')"
+
+# Testar query executor
+python -c "from src.spider.query_executor import SpiderQueryExecutor; ex = SpiderQueryExecutor(); print(f'DBs encontrados: {len(ex.list_available_dbs())}')"
+
+# Testar metrics
+python -c "from src.spider.metrics import sql_similarity_score; print(f'Score: {sql_similarity_score(\"SELECT * FROM a\", \"select * from a\")}')"
+```
+
+## Arquitetura do módulo Spider
+
+O módulo reutiliza o grafo existente para avaliar cada pergunta do Spider dataset:
+
+### Fluxo por pergunta
+
+```
+1. Load: data_loader carrega dev.json (1.034 exemplos)
+   └─> Cada exemplo tem: {question, query (ouro), db_id}
+
+2. Query Ouro: SpiderQueryExecutor executa query original no banco
+   └─> Captura resultado esperado (baseline)
+
+3. Para cada tentativa (até 3x):
+   a) Estado Inicial: pergunta + schema do banco
+   b) Grafo Executa: planejador → schema → code_agent → sandbox → crítico
+   c) Stream Acumula: full_estado coleta mudanças de cada nó
+   d) Crítico Decide:
+      - Se aprovado: extrai query_agente, compara com ouro
+      - Se reprovado: volta ao planejador (retry)
+   e) CSV Registra: 1 linha por tentativa com todos os dados
+
+4. Resumo: CSVReporter calcula estatísticas (taxa aprovação, similarity média, etc)
+```
+
+### Pontos técnicos importantes
+
+- **State Accumulation**: O stream() retorna deltas por nó, não estado total. Script mantém `full_estado = estado_inicial.copy()` e atualiza com cada output
+- **Recursion Limit**: Config de `recursion_limit=30` permite planejador iterar até 3 vezes sem erro
+- **Database Path**: Os bancos do Spider ficam em `data/spider_data/spider_data/database/{db_id}/{db_id}.sqlite`
+- **Read-only Mode**: SQLite conecta com `?mode=ro&uri=true` para evitar escrita
+- **Normalization**: SQL é normalizado (UPPER, sem comentários, sem whitespace) antes de comparar similarity
+
+### Customizando a avaliação
+
+Para testar com diferentes LLMs ou ajustar prompts:
+
+1. Editar `src/nodes/planner.py`, `code_agent.py`, `critic.py` conforme necessário
+2. O script usa o mesmo grafo (`src/graph.py`), então mudanças se refletem automaticamente
+3. Para testar especificamente um nó: reutilize o teste em `tests/test_nodes.py`
+
+## Avaliação com Spider Dataset
+
+O projeto inclui um sistema completo de avaliação contra o dataset Spider, que rastreia cada tentativa do agente quando o crítico reprova.
+
+### Setup
+
+```bash
+# Baixar o dataset Spider
+# Extrair em data/spider_data/
+# Estrutura esperada:
+# data/spider_data/spider_data/
+#   ├── dev.json
+#   ├── database/
+#   │   ├── concert_singer/
+#   │   ├── pets_1/
+#   │   └── ...
+```
+
+### Executar avaliação
+
+```bash
+# Teste simples com 10 perguntas
+python scripts/test_spider_eval.py
+
+# Com customizações
+python scripts/test_spider_eval.py \
+  --sample-size 50 \
+  --seed 42 \
+  --db-filter concert_singer \
+  --output reports/spider_eval.csv \
+  --max-attempts 5
+
+# Parâmetros:
+# --sample-size N       : Quantas perguntas testar (default: 10)
+# --seed SEED           : Seed para reprodutibilidade (default: 42)
+# --db-filter DB_ID     : Filtrar por um banco (ex: concert_singer)
+# --output PATH         : Caminho CSV (default: reports/spider_eval_TIMESTAMP.csv)
+# --max-attempts N      : Máx tentativas por pergunta (default: 3)
+```
+
+### Entender os resultados
+
+O script gera um CSV com **uma linha por tentativa**:
+
+| Coluna | Significado |
+|--------|-------------|
+| tentativa_numero | 1ª, 2ª, 3ª tentativa desta pergunta |
+| veredito_critico | aprovado/reprovado/erro |
+| feedback_critico | Motivo da reprovação (para debug) |
+| similarity_score_sql | 0-1, similaridade com query ouro |
+| resultado_exato_match | True se resultado foi idêntico |
+| tempo_agente_ms | Quanto tempo levou aquela tentativa |
+
+Resumo final:
+```
+Total de perguntas: 100
+Perguntas aprovadas: 82
+Taxa de aprovação: 82.0%
+Taxa de sucesso na 1ª tentativa: 75.0%
+Tentativas médias por pergunta: 1.23
+Similarity score médio: 0.945
+Tempo médio por tentativa: 12500 ms
+```
+
+**Análise**:
+- Taxa de 1ª tentativa baixa? → Agente gerando queries incorretas inicialmente
+- Similarity alto mas veredito reprovado? → Queries diferentes semanticamente
+- Muitas tentativas? → Crítico ou agente não aprendendo com feedback
+
 ## Estrutura de arquivos
 
 ```
@@ -83,7 +206,11 @@ TextToInsight/
 ├── .env                           # GOOGLE_API_KEY (não commitar)
 ├── requirements.txt
 ├── data/
-│   └── olist_relational.db        # Banco SQLite
+│   ├── olist_relacional.db        # Banco SQLite (dados de teste)
+│   └── spider_data/               # Dataset Spider (opcional)
+│       └── spider_data/
+│           ├── dev.json
+│           └── database/
 ├── src/
 │   ├── state.py                   # EstadoTextToInsight (TypedDict)
 │   ├── graph.py                   # Grafo LangGraph
@@ -95,8 +222,17 @@ TextToInsight/
 │   │   │   └── code_sql.py        # Validação + execução SQL
 │   │   ├── sandbox.py             # Executor SQL (banco real)
 │   │   └── critic.py              # Avaliador (Gemini)
+│   ├── spider/                    # **Módulo de Avaliação Spider**
+│   │   ├── data_loader.py         # Carregar dev.json
+│   │   ├── query_executor.py      # Executar queries
+│   │   ├── metrics.py             # Similarity score, comparações
+│   │   └── csv_reporter.py        # Gerar CSV por tentativa
 │   └── routers/
 │       └── edges.py               # Roteadores condicionais
+├── scripts/
+│   └── test_spider_eval.py        # Script de avaliação Spider
+├── reports/
+│   └── spider_eval_*.csv          # Resultados das avaliações
 └── tests/
     ├── test_componentes.py        # Sem API
     ├── test_nodes.py              # Com API, nó a nó
diff --git a/README.md b/README.md
index 00ab3a2..7de6682 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,83 @@ pytest tests/test_nodes.py -v -s
 pytest tests/test_integracao.py -v -s
 ```
 
+## Avaliação com Spider Dataset
+
+Sistema completo de avaliação que testa o agente contra o dataset Spider, rastreando cada tentativa (quando o crítico reprova e volta ao planejador).
+
+**Nota**: Spider Dataset é opcional. Use apenas se quiser avaliar contra 1.034 exemplos reais.
+
+### Setup
+
+1. [Baixar Spider Dataset](https://drive.google.com/uc?export=download&id=1iYkIGr7MwuOvBkRkj4RKs6Ff3NMa7NMG)
+2. Descompactar em `data/spider_data/`
+3. Verificar estrutura:
+   ```
+   data/spider_data/spider_data/
+   ├── dev.json
+   ├── database/
+   │   ├── concert_singer/
+   │   ├── pets_1/
+   │   └── ... (20 bancos no total)
+   ```
+
+### Como usar
+
+```bash
+# Teste simples com 10 perguntas (padrão)
+python scripts/test_spider_eval.py
+
+# Com customizações
+python scripts/test_spider_eval.py \
+  --sample-size 50 \
+  --seed 42 \
+  --db-filter concert_singer \
+  --output reports/my_eval.csv
+
+# Parâmetros
+# --sample-size N       : Quantas perguntas testar (default: 10)
+# --seed SEED           : Seed para reprodutibilidade (default: 42)
+# --db-filter DB_ID     : Filtrar por um banco específico
+# --output PATH         : Caminho para salvar CSV (default: reports/spider_eval_TIMESTAMP.csv)
+# --max-attempts N      : Máximo de tentativas por pergunta (default: 3)
+```
+
+### Saída
+
+O script gera um CSV com 12 colunas, **uma linha por tentativa**:
+
+```
+id_exemplo | tentativa_numero | db_id | pergunta_usuario | query_ouro_spider | query_agente_tentativa | veredito_critico | feedback_critico_recebido | similarity_score_sql | resultado_exato_match | ...
+1          | 1                | concert_singer | How many singers? | SELECT ... | SELECT ... | reprovado | Table "singers" does not... | 0.85   | -
+1          | 2                | concert_singer | How many singers? | SELECT ... | SELECT ... | aprovado  | Aprovado | 1.00 | True
+```
+
+E um resumo final:
+
+```
+Total de perguntas: 10
+Total de tentativas: 12
+Perguntas aprovadas: 8
+Taxa de aprovação: 80%
+Taxa de sucesso na 1ª tentativa: 60%
+Tentativas médias por pergunta: 1.2
+Similarity score médio: 0.92
+```
+
+### Interpretação
+
+- **id_exemplo**: ID da pergunta (mesmo para todas tentativas dela)
+- **tentativa_numero**: 1ª, 2ª, 3ª tentativa...
+- **veredito_critico**: Aprovado/Reprovado/Erro naquela tentativa
+- **feedback_critico_recebido**: Motivo da reprovação (útil para debug)
+- **similarity_score_sql**: 0-1, quanto a query do agente se parece com a ouro
+- **resultado_exato_match**: Se o resultado executado foi exatamente igual
+
+**Análise típica**:
+- Taxa de 1ª tentativa baixa? → Agente está gerando queries incorretas inicialmente
+- Similarity score alto mas veredito reprovado? → Queries sintaticamente parecidas mas semanticamente diferentes
+- Muitas tentativas? → Crítico não está dando feedback útil ou agente não aprende com feedback
+
 ## Estrutura
 
 ```
@@ -84,8 +161,16 @@ TextToInsight/
 │   │   │   └── code_sql.py              # Validação e execução de SQL
 │   │   ├── sandbox.py                   # Executor de SQL (banco real)
 │   │   └── critic.py                    # Avaliador de qualidade (LLM)
-│   └── routers/
-│       └── edges.py                     # Roteadores condicionais
+│   ├── routers/
+│   │   └── edges.py                     # Roteadores condicionais
+│   └── spider/                          # **NOVO**: Módulo Spider
+│       ├── __init__.py
+│       ├── data_loader.py               # Carregar exemplos de dev.json
+│       ├── query_executor.py            # Executar queries no banco
+│       ├── metrics.py                   # Similarity score, comparações
+│       └── csv_reporter.py              # Salvar CSV por tentativa
+├── scripts/
+│   └── test_spider_eval.py              # **NOVO**: Script de avaliação
 └── tests/
     ├── test_componentes.py              # Testes sem API
     ├── test_nodes.py                    # Testes por nó com API
diff --git a/projeto-raia/bin/Activate.ps1 b/projeto-raia/bin/Activate.ps1
new file mode 100644
index 0000000..b49d77b
--- /dev/null
+++ b/projeto-raia/bin/Activate.ps1
@@ -0,0 +1,247 @@
+<#
+.Synopsis
+Activate a Python virtual environment for the current PowerShell session.
+
+.Description
+Pushes the python executable for a virtual environment to the front of the
+$Env:PATH environment variable and sets the prompt to signify that you are
+in a Python virtual environment. Makes use of the command line switches as
+well as the `pyvenv.cfg` file values present in the virtual environment.
+
+.Parameter VenvDir
+Path to the directory that contains the virtual environment to activate. The
+default value for this is the parent of the directory that the Activate.ps1
+script is located within.
+
+.Parameter Prompt
+The prompt prefix to display when this virtual environment is activated. By
+default, this prompt is the name of the virtual environment folder (VenvDir)
+surrounded by parentheses and followed by a single space (ie. '(.venv) ').
+
+.Example
+Activate.ps1
+Activates the Python virtual environment that contains the Activate.ps1 script.
+
+.Example
+Activate.ps1 -Verbose
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and shows extra information about the activation as it executes.
+
+.Example
+Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
+Activates the Python virtual environment located in the specified location.
+
+.Example
+Activate.ps1 -Prompt "MyPython"
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and prefixes the current prompt with the specified string (surrounded in
+parentheses) while the virtual environment is active.
+
+.Notes
+On Windows, it may be required to enable this Activate.ps1 script by setting the
+execution policy for the user. You can do this by issuing the following PowerShell
+command:
+
+PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+For more information on Execution Policies: 
+https://go.microsoft.com/fwlink/?LinkID=135170
+
+#>
+Param(
+    [Parameter(Mandatory = $false)]
+    [String]
+    $VenvDir,
+    [Parameter(Mandatory = $false)]
+    [String]
+    $Prompt
+)
+
+<# Function declarations --------------------------------------------------- #>
+
+<#
+.Synopsis
+Remove all shell session elements added by the Activate script, including the
+addition of the virtual environment's Python executable from the beginning of
+the PATH variable.
+
+.Parameter NonDestructive
+If present, do not remove this function from the global namespace for the
+session.
+
+#>
+function global:deactivate ([switch]$NonDestructive) {
+    # Revert to original values
+
+    # The prior prompt:
+    if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
+        Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
+        Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
+    }
+
+    # The prior PYTHONHOME:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
+        Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
+    }
+
+    # The prior PATH:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
+        Remove-Item -Path Env:_OLD_VIRTUAL_PATH
+    }
+
+    # Just remove the VIRTUAL_ENV altogether:
+    if (Test-Path -Path Env:VIRTUAL_ENV) {
+        Remove-Item -Path env:VIRTUAL_ENV
+    }
+
+    # Just remove VIRTUAL_ENV_PROMPT altogether.
+    if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
+        Remove-Item -Path env:VIRTUAL_ENV_PROMPT
+    }
+
+    # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
+    if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
+        Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
+    }
+
+    # Leave deactivate function in the global namespace if requested:
+    if (-not $NonDestructive) {
+        Remove-Item -Path function:deactivate
+    }
+}
+
+<#
+.Description
+Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
+given folder, and returns them in a map.
+
+For each line in the pyvenv.cfg file, if that line can be parsed into exactly
+two strings separated by `=` (with any amount of whitespace surrounding the =)
+then it is considered a `key = value` line. The left hand string is the key,
+the right hand is the value.
+
+If the value starts with a `'` or a `"` then the first and last character is
+stripped from the value before being captured.
+
+.Parameter ConfigDir
+Path to the directory that contains the `pyvenv.cfg` file.
+#>
+function Get-PyVenvConfig(
+    [String]
+    $ConfigDir
+) {
+    Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
+
+    # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
+    $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
+
+    # An empty map will be returned if no config file is found.
+    $pyvenvConfig = @{ }
+
+    if ($pyvenvConfigPath) {
+
+        Write-Verbose "File exists, parse `key = value` lines"
+        $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
+
+        $pyvenvConfigContent | ForEach-Object {
+            $keyval = $PSItem -split "\s*=\s*", 2
+            if ($keyval[0] -and $keyval[1]) {
+                $val = $keyval[1]
+
+                # Remove extraneous quotations around a string value.
+                if ("'""".Contains($val.Substring(0, 1))) {
+                    $val = $val.Substring(1, $val.Length - 2)
+                }
+
+                $pyvenvConfig[$keyval[0]] = $val
+                Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
+            }
+        }
+    }
+    return $pyvenvConfig
+}
+
+
+<# Begin Activate script --------------------------------------------------- #>
+
+# Determine the containing directory of this script
+$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
+$VenvExecDir = Get-Item -Path $VenvExecPath
+
+Write-Verbose "Activation script is located in path: '$VenvExecPath'"
+Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
+Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
+
+# Set values required in priority: CmdLine, ConfigFile, Default
+# First, get the location of the virtual environment, it might not be
+# VenvExecDir if specified on the command line.
+if ($VenvDir) {
+    Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
+}
+else {
+    Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
+    $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
+    Write-Verbose "VenvDir=$VenvDir"
+}
+
+# Next, read the `pyvenv.cfg` file to determine any required value such
+# as `prompt`.
+$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
+
+# Next, set the prompt from the command line, or the config file, or
+# just use the name of the virtual environment folder.
+if ($Prompt) {
+    Write-Verbose "Prompt specified as argument, using '$Prompt'"
+}
+else {
+    Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
+    if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
+        Write-Verbose "  Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
+        $Prompt = $pyvenvCfg['prompt'];
+    }
+    else {
+        Write-Verbose "  Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
+        Write-Verbose "  Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
+        $Prompt = Split-Path -Path $venvDir -Leaf
+    }
+}
+
+Write-Verbose "Prompt = '$Prompt'"
+Write-Verbose "VenvDir='$VenvDir'"
+
+# Deactivate any currently active virtual environment, but leave the
+# deactivate function in place.
+deactivate -nondestructive
+
+# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
+# that there is an activated venv.
+$env:VIRTUAL_ENV = $VenvDir
+
+if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
+
+    Write-Verbose "Setting prompt to '$Prompt'"
+
+    # Set the prompt to include the env name
+    # Make sure _OLD_VIRTUAL_PROMPT is global
+    function global:_OLD_VIRTUAL_PROMPT { "" }
+    Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
+    New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
+
+    function global:prompt {
+        Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
+        _OLD_VIRTUAL_PROMPT
+    }
+    $env:VIRTUAL_ENV_PROMPT = $Prompt
+}
+
+# Clear PYTHONHOME
+if (Test-Path -Path Env:PYTHONHOME) {
+    Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
+    Remove-Item -Path Env:PYTHONHOME
+}
+
+# Add the venv to the PATH
+Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
+$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
diff --git a/projeto-raia/bin/activate b/projeto-raia/bin/activate
new file mode 100644
index 0000000..503c906
--- /dev/null
+++ b/projeto-raia/bin/activate
@@ -0,0 +1,70 @@
+# This file must be used with "source bin/activate" *from bash*
+# You cannot run it directly
+
+deactivate () {
+    # reset old environment variables
+    if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
+        PATH="${_OLD_VIRTUAL_PATH:-}"
+        export PATH
+        unset _OLD_VIRTUAL_PATH
+    fi
+    if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
+        PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
+        export PYTHONHOME
+        unset _OLD_VIRTUAL_PYTHONHOME
+    fi
+
+    # Call hash to forget past commands. Without forgetting
+    # past commands the $PATH changes we made may not be respected
+    hash -r 2> /dev/null
+
+    if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
+        PS1="${_OLD_VIRTUAL_PS1:-}"
+        export PS1
+        unset _OLD_VIRTUAL_PS1
+    fi
+
+    unset VIRTUAL_ENV
+    unset VIRTUAL_ENV_PROMPT
+    if [ ! "${1:-}" = "nondestructive" ] ; then
+    # Self destruct!
+        unset -f deactivate
+    fi
+}
+
+# unset irrelevant variables
+deactivate nondestructive
+
+# on Windows, a path can contain colons and backslashes and has to be converted:
+if [ "${OSTYPE:-}" = "cygwin" ] || [ "${OSTYPE:-}" = "msys" ] ; then
+    # transform D:\path\to\venv to /d/path/to/venv on MSYS
+    # and to /cygdrive/d/path/to/venv on Cygwin
+    export VIRTUAL_ENV=$(cygpath /home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia)
+else
+    # use the path as-is
+    export VIRTUAL_ENV=/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia
+fi
+
+_OLD_VIRTUAL_PATH="$PATH"
+PATH="$VIRTUAL_ENV/"bin":$PATH"
+export PATH
+
+# unset PYTHONHOME if set
+# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
+# could use `if (set -u; : $PYTHONHOME) ;` in bash
+if [ -n "${PYTHONHOME:-}" ] ; then
+    _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
+    unset PYTHONHOME
+fi
+
+if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
+    _OLD_VIRTUAL_PS1="${PS1:-}"
+    PS1='(projeto-raia) '"${PS1:-}"
+    export PS1
+    VIRTUAL_ENV_PROMPT='(projeto-raia) '
+    export VIRTUAL_ENV_PROMPT
+fi
+
+# Call hash to forget past commands. Without forgetting
+# past commands the $PATH changes we made may not be respected
+hash -r 2> /dev/null
diff --git a/projeto-raia/bin/activate.csh b/projeto-raia/bin/activate.csh
new file mode 100644
index 0000000..62b36bd
--- /dev/null
+++ b/projeto-raia/bin/activate.csh
@@ -0,0 +1,27 @@
+# This file must be used with "source bin/activate.csh" *from csh*.
+# You cannot run it directly.
+
+# Created by Davide Di Blasi <davidedb@gmail.com>.
+# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
+
+alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
+
+# Unset irrelevant variables.
+deactivate nondestructive
+
+setenv VIRTUAL_ENV /home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia
+
+set _OLD_VIRTUAL_PATH="$PATH"
+setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
+
+
+set _OLD_VIRTUAL_PROMPT="$prompt"
+
+if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
+    set prompt = '(projeto-raia) '"$prompt"
+    setenv VIRTUAL_ENV_PROMPT '(projeto-raia) '
+endif
+
+alias pydoc python -m pydoc
+
+rehash
diff --git a/projeto-raia/bin/activate.fish b/projeto-raia/bin/activate.fish
new file mode 100644
index 0000000..803ed0d
--- /dev/null
+++ b/projeto-raia/bin/activate.fish
@@ -0,0 +1,69 @@
+# This file must be used with "source <venv>/bin/activate.fish" *from fish*
+# (https://fishshell.com/). You cannot run it directly.
+
+function deactivate  -d "Exit virtual environment and return to normal shell environment"
+    # reset old environment variables
+    if test -n "$_OLD_VIRTUAL_PATH"
+        set -gx PATH $_OLD_VIRTUAL_PATH
+        set -e _OLD_VIRTUAL_PATH
+    end
+    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
+        set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
+        set -e _OLD_VIRTUAL_PYTHONHOME
+    end
+
+    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
+        set -e _OLD_FISH_PROMPT_OVERRIDE
+        # prevents error when using nested fish instances (Issue #93858)
+        if functions -q _old_fish_prompt
+            functions -e fish_prompt
+            functions -c _old_fish_prompt fish_prompt
+            functions -e _old_fish_prompt
+        end
+    end
+
+    set -e VIRTUAL_ENV
+    set -e VIRTUAL_ENV_PROMPT
+    if test "$argv[1]" != "nondestructive"
+        # Self-destruct!
+        functions -e deactivate
+    end
+end
+
+# Unset irrelevant variables.
+deactivate nondestructive
+
+set -gx VIRTUAL_ENV /home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia
+
+set -gx _OLD_VIRTUAL_PATH $PATH
+set -gx PATH "$VIRTUAL_ENV/"bin $PATH
+
+# Unset PYTHONHOME if set.
+if set -q PYTHONHOME
+    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
+    set -e PYTHONHOME
+end
+
+if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
+    # fish uses a function instead of an env var to generate the prompt.
+
+    # Save the current fish_prompt function as the function _old_fish_prompt.
+    functions -c fish_prompt _old_fish_prompt
+
+    # With the original prompt function renamed, we can override with our own.
+    function fish_prompt
+        # Save the return status of the last command.
+        set -l old_status $status
+
+        # Output the venv prompt; color taken from the blue of the Python logo.
+        printf "%s%s%s" (set_color 4B8BBE) '(projeto-raia) ' (set_color normal)
+
+        # Restore the return status of the previous command.
+        echo "exit $old_status" | .
+        # Output the original/"old" prompt.
+        _old_fish_prompt
+    end
+
+    set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
+    set -gx VIRTUAL_ENV_PROMPT '(projeto-raia) '
+end
diff --git a/projeto-raia/bin/distro b/projeto-raia/bin/distro
new file mode 100755
index 0000000..9dda94e
--- /dev/null
+++ b/projeto-raia/bin/distro
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from distro.distro import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/projeto-raia/bin/dotenv b/projeto-raia/bin/dotenv
new file mode 100755
index 0000000..3c3640a
--- /dev/null
+++ b/projeto-raia/bin/dotenv
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from dotenv.__main__ import cli
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(cli())
diff --git a/projeto-raia/bin/filetype b/projeto-raia/bin/filetype
new file mode 100755
index 0000000..6baebd3
--- /dev/null
+++ b/projeto-raia/bin/filetype
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from filetype.__main__ import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/projeto-raia/bin/httpx b/projeto-raia/bin/httpx
new file mode 100755
index 0000000..1a4903a
--- /dev/null
+++ b/projeto-raia/bin/httpx
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from httpx import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/projeto-raia/bin/jsondiff b/projeto-raia/bin/jsondiff
new file mode 100755
index 0000000..967b5c0
--- /dev/null
+++ b/projeto-raia/bin/jsondiff
@@ -0,0 +1,41 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import sys
+import json
+import jsonpatch
+import argparse
+
+
+parser = argparse.ArgumentParser(description='Diff two JSON files')  # module-level CLI definition read by diff_files()
+parser.add_argument('FILE1', type=argparse.FileType('r'))  # argparse opens the given path for reading
+parser.add_argument('FILE2', type=argparse.FileType('r'))  # argparse opens the given path for reading
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-u', '--preserve-unicode', action='store_true',
+                    help='Output Unicode character as-is without using Code Point')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpatch.__version__)
+
+
+def main():  # CLI entry point: delegates all the work to diff_files()
+    try:
+        diff_files()
+    except KeyboardInterrupt:
+        sys.exit(1)  # an interrupt ends the program with exit status 1 instead of a traceback
+
+
+def diff_files():
+    """ Diff FILE1 against FILE2; when a patch results, print it as JSON and exit with status 1 """
+    args = parser.parse_args()
+    doc1 = json.load(args.FILE1)
+    doc2 = json.load(args.FILE2)
+    patch = jsonpatch.make_patch(doc1, doc2)
+    if patch.patch:  # nothing is printed when patch.patch is empty
+        print(json.dumps(patch.patch, indent=args.indent, ensure_ascii=not(args.preserve_unicode)))
+        sys.exit(1)  # status 1 tells the caller that a patch was printed
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()
diff --git a/projeto-raia/bin/jsonpatch b/projeto-raia/bin/jsonpatch
new file mode 100755
index 0000000..baaf531
--- /dev/null
+++ b/projeto-raia/bin/jsonpatch
@@ -0,0 +1,107 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import json
+import jsonpatch
+import tempfile
+import argparse
+
+
+parser = argparse.ArgumentParser(  # module-level CLI definition read by patch_files()
+    description='Apply a JSON patch on a JSON file')
+parser.add_argument('ORIGINAL', type=argparse.FileType('r'),
+                    help='Original file')
+parser.add_argument('PATCH', type=argparse.FileType('r'),
+                    nargs='?', default=sys.stdin,
+                    help='Patch file (read from stdin if omitted)')
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-b', '--backup', action='store_true',
+                    help='Back up ORIGINAL if modifying in-place')
+parser.add_argument('-i', '--in-place', action='store_true',
+                    help='Modify ORIGINAL in-place instead of to stdout')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpatch.__version__)
+parser.add_argument('-u', '--preserve-unicode', action='store_true',
+                    help='Output Unicode character as-is without using Code Point')
+
+def main():  # CLI entry point: delegates all the work to patch_files()
+    try:
+        patch_files()
+    except KeyboardInterrupt:
+        sys.exit(1)  # an interrupt ends the program with exit status 1 instead of a traceback
+
+
+def patch_files():
+    """ Apply the JSON patch in PATCH to ORIGINAL; write the result to stdout, or back to ORIGINAL with --in-place """
+    args = parser.parse_args()
+    doc = json.load(args.ORIGINAL)
+    patch = json.load(args.PATCH)
+    result = jsonpatch.apply_patch(doc, patch)
+
+    if args.in_place:
+        dirname = os.path.abspath(os.path.dirname(args.ORIGINAL.name))
+
+        try:
+            # Attempt to replace the file atomically.  We do this by
+            # creating a temporary file in the same directory as the
+            # original file so we can atomically move the new file over
+            # the original later.  (This is done in the same directory
+            # because atomic renames do not work across mount points.)
+
+            fd, pathname = tempfile.mkstemp(dir=dirname)
+            fp = os.fdopen(fd, 'w')
+            atomic = True
+
+        except OSError:
+            # We failed to create the temporary file for an atomic
+            # replace, so fall back to non-atomic mode by backing up
+            # the original (if desired) and writing a new file.
+
+            if args.backup:
+                os.rename(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
+            fp = open(args.ORIGINAL.name, 'w')
+            atomic = False
+
+    else:
+        # Since we're not replacing the original file in-place, write
+        # the modified JSON to stdout instead.
+
+        fp = sys.stdout
+
+    # By this point we have some sort of file object we can write the
+    # modified JSON to.
+
+    json.dump(result, fp, indent=args.indent, ensure_ascii=not(args.preserve_unicode))
+    fp.write('\n')  # json.dump does not emit a trailing newline itself
+
+    if args.in_place:
+        # Close the new file.  If we aren't replacing atomically, this
+        # is our last step, since everything else is already in place.
+
+        fp.close()
+
+        if atomic:  # only bound when --in-place was given (see the first branch)
+            try:
+                # Complete the atomic replace by linking the original
+                # to a backup (if desired), fixing up the permissions
+                # on the temporary file, and moving it into place.
+
+                if args.backup:
+                    os.link(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
+                os.chmod(pathname, os.stat(args.ORIGINAL.name).st_mode)
+                os.rename(pathname, args.ORIGINAL.name)
+
+            except OSError:
+                # In the event we could not actually do the atomic
+                # replace, unlink the original to move it out of the
+                # way and finally move the temporary file into place.
+
+                os.unlink(args.ORIGINAL.name)
+                os.rename(pathname, args.ORIGINAL.name)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()
diff --git a/projeto-raia/bin/jsonpointer b/projeto-raia/bin/jsonpointer
new file mode 100755
index 0000000..a8a7614
--- /dev/null
+++ b/projeto-raia/bin/jsonpointer
@@ -0,0 +1,66 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+
+
+import argparse
+import json
+import sys
+
+import jsonpointer
+
+parser = argparse.ArgumentParser(  # module-level CLI definition read by resolve_files()
+    description='Resolve a JSON pointer on JSON files')
+
+# Accept pointer as argument or as file; both forms sit in one required, mutually exclusive group
+ptr_group = parser.add_mutually_exclusive_group(required=True)
+
+ptr_group.add_argument('-f', '--pointer-file', type=argparse.FileType('r'),
+                       nargs='?',
+                       help='File containing a JSON pointer expression')
+
+ptr_group.add_argument('POINTER', type=str, nargs='?',
+                       help='A JSON pointer expression')
+
+parser.add_argument('FILE', type=argparse.FileType('r'), nargs='+',
+                    help='Files for which the pointer should be resolved')
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpointer.__version__)
+
+
+def main():  # CLI entry point: delegates all the work to resolve_files()
+    try:
+        resolve_files()
+    except KeyboardInterrupt:
+        sys.exit(1)  # an interrupt ends the program with exit status 1 instead of a traceback
+
+
+def parse_pointer(args):  # returns the pointer expression taken from the parsed arguments
+    if args.POINTER:
+        ptr = args.POINTER
+    elif args.pointer_file:
+        ptr = args.pointer_file.read().strip()  # whole file content, surrounding whitespace removed
+    else:
+        parser.print_usage()
+        sys.exit(1)  # neither source was supplied: show usage and exit with status 1
+    # ptr is always bound here because the else branch above exits
+    return ptr
+
+
+def resolve_files():
+    """ Resolve one JSON pointer against every FILE, printing each result as JSON """
+    args = parser.parse_args()
+    # The same pointer expression is applied to all files
+    ptr = parse_pointer(args)
+    # A pointer that cannot be resolved in a file is reported on stderr and the loop goes on
+    for f in args.FILE:
+        doc = json.load(f)
+        try:
+            result = jsonpointer.resolve_pointer(doc, ptr)
+            print(json.dumps(result, indent=args.indent))
+        except jsonpointer.JsonPointerException as e:
+            print('Could not resolve pointer: %s' % str(e), file=sys.stderr)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()
diff --git a/projeto-raia/bin/normalizer b/projeto-raia/bin/normalizer
new file mode 100755
index 0000000..e8e795d
--- /dev/null
+++ b/projeto-raia/bin/normalizer
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from charset_normalizer.cli import cli_detect
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(cli_detect())  # cli_detect()'s return value becomes the exit status
diff --git a/projeto-raia/bin/pip b/projeto-raia/bin/pip
new file mode 100755
index 0000000..80760a2
--- /dev/null
+++ b/projeto-raia/bin/pip
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(main())  # main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/pip3 b/projeto-raia/bin/pip3
new file mode 100755
index 0000000..80760a2
--- /dev/null
+++ b/projeto-raia/bin/pip3
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(main())  # main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/pip3.12 b/projeto-raia/bin/pip3.12
new file mode 100755
index 0000000..80760a2
--- /dev/null
+++ b/projeto-raia/bin/pip3.12
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(main())  # main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/py.test b/projeto-raia/bin/py.test
new file mode 100755
index 0000000..9943710
--- /dev/null
+++ b/projeto-raia/bin/py.test
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pytest import console_main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(console_main())  # console_main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/pygmentize b/projeto-raia/bin/pygmentize
new file mode 100755
index 0000000..559002d
--- /dev/null
+++ b/projeto-raia/bin/pygmentize
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pygments.cmdline import main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(main())  # main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/pytest b/projeto-raia/bin/pytest
new file mode 100755
index 0000000..9943710
--- /dev/null
+++ b/projeto-raia/bin/pytest
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pytest import console_main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(console_main())  # console_main()'s return value becomes the exit status
diff --git a/projeto-raia/bin/python b/projeto-raia/bin/python
new file mode 120000
index 0000000..b8a0adb
--- /dev/null
+++ b/projeto-raia/bin/python
@@ -0,0 +1 @@
+python3
\ No newline at end of file
diff --git a/projeto-raia/bin/python3 b/projeto-raia/bin/python3
new file mode 120000
index 0000000..ae65fda
--- /dev/null
+++ b/projeto-raia/bin/python3
@@ -0,0 +1 @@
+/usr/bin/python3
\ No newline at end of file
diff --git a/projeto-raia/bin/python3.12 b/projeto-raia/bin/python3.12
new file mode 120000
index 0000000..b8a0adb
--- /dev/null
+++ b/projeto-raia/bin/python3.12
@@ -0,0 +1 @@
+python3
\ No newline at end of file
diff --git a/projeto-raia/bin/websockets b/projeto-raia/bin/websockets
new file mode 100755
index 0000000..e9a8e4a
--- /dev/null
+++ b/projeto-raia/bin/websockets
@@ -0,0 +1,8 @@
+#!/home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from websockets.cli import main
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])  # drop a trailing "-script.pyw" or ".exe" from the program name
+    sys.exit(main())  # main()'s return value becomes the exit status
diff --git a/projeto-raia/lib64 b/projeto-raia/lib64
new file mode 120000
index 0000000..7951405
--- /dev/null
+++ b/projeto-raia/lib64
@@ -0,0 +1 @@
+lib
\ No newline at end of file
diff --git a/projeto-raia/pyvenv.cfg b/projeto-raia/pyvenv.cfg
new file mode 100644
index 0000000..ff5363e
--- /dev/null
+++ b/projeto-raia/pyvenv.cfg
@@ -0,0 +1,5 @@
+home = /usr/bin
+include-system-site-packages = false
+version = 3.12.3
+executable = /usr/bin/python3.12
+command = /usr/bin/python3 -m venv /home/jonasmelo/ProjectsAndStudies/TextToInsight/projeto-raia
diff --git a/reports/meu_teste.csv b/reports/meu_teste.csv
new file mode 100644
index 0000000..61890b3
--- /dev/null
+++ b/reports/meu_teste.csv
@@ -0,0 +1,4 @@
+id_exemplo,tentativa_numero,db_id,pergunta_usuario,query_ouro_spider,query_agente_tentativa,tempo_agente_ms,veredito_critico,feedback_critico_recebido,erro_execucao,resultado_exato_match,similarity_score_sql
+1,1,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",12383.0,reprovado,"A consulta SQL assume que a coluna `Song_Name` existe diretamente na tabela `singer`, o que é uma estrutura de banco de dados incomum para representar múltiplas músicas de um cantor. Para responder adequadamente à pergunta, que implica que um cantor pode ter várias músicas, seria necessário fazer um `JOIN` entre as tabelas `singer` e `song` (assumindo uma tabela `song` separada). A query gerada pode falhar em encontrar todos os cantores relevantes se a coluna `Song_Name` na tabela `singer` não representar todas as músicas do cantor ou se não existir.",,True,0.9919
+1,2,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",29157.91,reprovado,"A consulta SQL assume que a coluna `Song_Name` existe diretamente na tabela `singer`. Em um esquema de banco de dados relacional típico, as músicas estariam em uma tabela separada (`song`) e ligadas aos cantores por uma chave estrangeira, exigindo uma operação `JOIN`. A consulta gerada não reflete essa estrutura comum e, portanto, não responde adequadamente à pergunta que implica uma relação entre cantores e músicas.",,True,0.9919
+1,3,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT DISTINCT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",65847.0,aprovado,"A consulta SQL responde corretamente à pergunta do usuário, selecionando o nome e o país dos cantores que possuem músicas com 'Hey' no título e utilizando `DISTINCT` para evitar duplicatas. Os resultados fazem sentido e a consulta foi executada com sucesso, indicando que a coluna `Song_Name` existe na tabela `singer` conforme a interpretação da IA.",,True,0.9242
diff --git a/scripts/test_spider_eval.py b/scripts/test_spider_eval.py
new file mode 100644
index 0000000..365f407
--- /dev/null
+++ b/scripts/test_spider_eval.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python3
+"""
+Script de Avaliação do Agente contra Spider Dataset.
+
+Testa o agente Text-to-Insight contra perguntas reais do Spider dataset,
+rastreando cada tentativa (quando o crítico reprova e volta ao planejador).
+
+Uso:
+    python scripts/test_spider_eval.py --sample-size 10 --seed 42
+    python scripts/test_spider_eval.py --db-filter concert_singer --output reports/eval.csv
+"""
+
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+
+# Importar módulos do projeto
+from src.graph import Graph
+from src.spider.csv_reporter import CSVReporter
+from src.spider.data_loader import (
+    filter_by_db_id,
+    get_unique_db_ids,
+    load_spider_dev_examples,
+    sample_examples,
+)
+from src.spider.metrics import (
+    build_comparison_row,
+    results_exact_match,
+    sql_similarity_score,
+)
+from src.spider.query_executor import SpiderQueryExecutor
+
+load_dotenv()
+
+
+def main():
+    """Evaluate the Text-to-Insight agent on Spider, writing one CSV row per attempt.
+
+    Exits with status 1 if the API key is missing, dev.json is absent or the graph fails to build.
+    """
+    parser = argparse.ArgumentParser(
+        description="Avaliar agente Text-to-Insight contra Spider dataset"
+    )
+    parser.add_argument(
+        "--sample-size",
+        type=int,
+        default=10,
+        help="Quantas perguntas testar (default: 10)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Seed para reproducibilidade (default: 42)",
+    )
+    parser.add_argument(
+        "--db-filter",
+        type=str,
+        help="Filtrar por banco específico (ex: concert_singer)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="Caminho para salvar CSV (default: reports/spider_eval_TIMESTAMP.csv)",
+    )
+    parser.add_argument(
+        "--max-attempts",
+        type=int,
+        default=3,
+        help="Máximo de tentativas por pergunta (default: 3)",
+    )
+    parser.add_argument(
+        "--data-dir",
+        type=str,
+        default="data/spider_data/spider_data",
+        help="Diretório com dados do Spider",
+    )
+
+    args = parser.parse_args()
+
+    # Validate the API key
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        print("❌ Erro: GOOGLE_API_KEY não encontrada em .env")
+        sys.exit(1)
+
+    # 1. Load the data
+    print(f"\n📂 Carregando exemplos do Spider de {args.data_dir}...")
+    try:
+        ejemplos = load_spider_dev_examples(args.data_dir)
+        print(f"✓ Carregados {len(ejemplos)} exemplos")
+    except FileNotFoundError as e:
+        print(f"❌ {e}")
+        sys.exit(1)
+
+    # 2. Apply filters
+    if args.db_filter:
+        ejemplos = filter_by_db_id(ejemplos, args.db_filter)
+        print(f"✓ Filtrados por db_id={args.db_filter}: {len(ejemplos)} exemplos")
+
+    # 3. Sample
+    ejemplos = sample_examples(ejemplos, sample_size=args.sample_size, seed=args.seed)
+    print(
+        f"✓ Selecionados {len(ejemplos)} exemplos (seed={args.seed}, "
+        f"bancos únicos: {len(get_unique_db_ids(ejemplos))})"
+    )
+
+    # 4. Initialize components
+    print("\n🔧 Inicializando componentes...")
+    try:
+        grafo = Graph(api_key)
+        print("✓ Grafo LangGraph inicializado")
+    except Exception as e:
+        print(f"❌ Erro ao inicializar grafo: {e}")
+        sys.exit(1)
+
+    executor = SpiderQueryExecutor()
+    print("✓ Query executor inicializado")
+
+    # 5. Prepare the CSV (timestamped default under reports/ when --output is not given)
+    csv_path = args.output or f"reports/{CSVReporter.generate_timestamped_filename('spider_eval')}"
+
+    reporter = CSVReporter(csv_path)
+    print(f"✓ CSV reporter inicializado: {csv_path}")
+
+    # 6. Evaluation loop
+    print(f"\n🚀 Iniciando avaliação com {len(ejemplos)} perguntas...\n")
+    print("=" * 100)
+
+    all_rows = []
+    ex_id = 1
+
+    for idx, ex in enumerate(ejemplos, 1):
+        pergunta = ex.get("question", "")
+        query_ouro = ex.get("query", "")
+        db_id = ex.get("db_id", "")
+
+        print(f"\n[{idx}/{len(ejemplos)}] Pergunta: {pergunta[:60]}...")
+        print(f"     DB: {db_id} | Query Ouro: {query_ouro[:50]}...")
+
+        # Run the gold query to obtain the expected result
+        print("     → Executando query ouro...")
+        resultado_ouro = executor.execute_query(db_id, query_ouro)
+
+        if not resultado_ouro["success"]:
+            print(f"     ⚠️  Erro na query ouro: {resultado_ouro['error']}")
+            continue  # Skip this example; no row was written, so ex_id is not consumed
+
+        print(f"     ✓ Query ouro retornou {resultado_ouro['row_count']} linhas")
+
+        # Stream the graph so that every attempt can be tracked
+        estado_inicial = {
+            "pergunta_usuario": pergunta,
+            "db_path": str(executor.get_db_path(db_id)),
+            "contexto_schema": "",
+            "sql_gerada": "",
+            "linhas_resultado_preview": [],
+            "total_linhas_resultado": 0,
+            "erro_execucao": "",
+            "saida_terminal": "",
+            "feedback_critico": "",
+            "status": "iniciado",
+            "tentativas_loop": 0,
+        }
+
+        print(f"     → Invocando agente (max tentativas: {args.max_attempts})...")
+        inicio_tentativa = time.perf_counter()  # restarted after every critic report
+        full_estado = estado_inicial.copy()  # state accumulated across nodes
+        encerrar = False  # set once the question is approved or out of attempts
+
+        try:
+            for output in grafo.stream(
+                estado_inicial, config={"recursion_limit": 30}
+            ):
+                # stream() yields {node_name: {changes}}; fold them into the accumulated state
+                for node_name, mudancas in output.items():
+                    full_estado.update(mudancas)
+                    # A report from the critic node closes one attempt: collect metrics and save
+                    if "critic" in node_name.lower():
+                        tempo_tentativa = (time.perf_counter() - inicio_tentativa) * 1000
+
+                        query_agente = full_estado.get("sql_gerada", "")
+                        veredito = full_estado.get("status", "")
+                        feedback_estado = full_estado.get("feedback_critico", "")
+                        erro_exec = full_estado.get("erro_execucao", "")
+                        tentativas = full_estado.get("tentativas_loop", 1)
+
+                        # Map the status to a verdict plus a fallback message for empty feedback
+                        if veredito == "aprovado":
+                            veredito_critico, padrao = "aprovado", "Aprovado"
+                        elif veredito == "reprovado":
+                            veredito_critico, padrao = "reprovado", "Reprovado pelo crítico"
+                        else:
+                            veredito_critico, padrao = "erro", "Erro na avaliação"
+                        feedback_critico = feedback_estado if feedback_estado else padrao
+
+                        # Compare results when the agent produced a query
+                        resultado_exato_match = None
+                        similarity_score = 0.0
+
+                        if query_agente and not erro_exec:
+                            resultado_agente = executor.execute_query(db_id, query_agente)
+                            if resultado_agente["success"]:
+                                resultado_exato_match = results_exact_match(
+                                    resultado_ouro["results"],
+                                    resultado_agente["results"],
+                                )
+                                similarity_score = sql_similarity_score(query_ouro, query_agente)
+                                print(
+                                    f"       Tentativa {tentativas}: "
+                                    f"similarity={similarity_score:.2f}, "
+                                    f"match={resultado_exato_match}, "
+                                    f"veredito={veredito_critico}"
+                                )
+                            else:
+                                erro_exec = resultado_agente["error"]
+                        else:
+                            print(
+                                f"       Tentativa {tentativas}: "
+                                f"sem query gerada ou com erro de execução"
+                            )
+
+                        # Build the CSV row
+                        row = build_comparison_row(
+                            id_exemplo=ex_id,
+                            tentativa_numero=tentativas,
+                            db_id=db_id,
+                            pergunta=pergunta,
+                            query_ouro=query_ouro,
+                            query_agente=query_agente,
+                            tempo_agente_ms=tempo_tentativa,
+                            veredito_critico=veredito_critico,
+                            feedback_critico=feedback_critico,
+                            erro_execucao=erro_exec,
+                            resultado_exato_match=resultado_exato_match,
+                            similarity_score=similarity_score,
+                        )
+
+                        reporter.append_row(row)
+                        all_rows.append(row)
+                        inicio_tentativa = time.perf_counter()  # the next attempt is timed from here
+
+                        # Stop on approval or once the attempt budget is spent
+                        if veredito_critico == "aprovado":
+                            print(f"     ✅ APROVADO na tentativa {tentativas}")
+                            encerrar = True
+                        elif tentativas >= args.max_attempts:
+                            print(f"     ❌ MÁXIMO DE TENTATIVAS ({args.max_attempts}) ATINGIDO")
+                            encerrar = True
+                        if encerrar:
+                            break
+                # The break above only leaves the node loop; stop consuming the stream as well
+                if encerrar:
+                    break
+
+        except Exception as e:
+            # Rows may already be saved under this ex_id: fall through so the id still advances
+            print(f"     ⚠️  Erro ao processar pergunta: {str(e)}")
+
+        ex_id += 1
+        time.sleep(1)  # Delay between questions
+
+    # 7. Print the summary
+    print("\n" + "=" * 100)
+    print("📊 RESUMO FINAL")
+    print("=" * 100)
+
+    if all_rows:
+        summary = reporter.generate_summary(all_rows)
+        print(f"Total de perguntas: {summary['total_perguntas']}")
+        print(f"Total de tentativas: {summary['total_tentativas']}")
+        print(f"Perguntas aprovadas: {summary['perguntas_aprovadas']}")
+        print(f"Taxa de aprovação: {summary['taxa_aprovacao']:.1%}")
+        print(f"Taxa de sucesso na 1ª tentativa: {summary['taxa_1a_tentativa']:.1%}")
+        print(f"Tentativas médias por pergunta: {summary['tentativas_media']:.2f}")
+        print(f"Similarity score médio: {summary['similarity_media']:.4f}")
+        print(f"Tempo médio por tentativa: {summary['tempo_medio_ms']:.2f} ms")
+        print(f"\n✅ CSV salvo em: {csv_path}")
+    else:
+        print("❌ Nenhum resultado para salvar")
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()
diff --git a/src/graph.py b/src/graph.py
index cd2f3a3..d9433ec 100644
--- a/src/graph.py
+++ b/src/graph.py
@@ -99,3 +99,18 @@ def _compilar_grafo(self) -> "CompiledStateGraph":
 
     def invoke(self, estado: EstadoTextToInsight):
         return self.grafo_text_to_insight.invoke(estado)
+
+    def stream(self, estado: EstadoTextToInsight, config: dict = None):
+        """
+        Run the graph in streaming mode, yielding state after each node.
+
+        Args:
+            estado: Initial state
+            config: Run configuration (e.g. recursion_limit); None becomes {}
+
+        Returns:
+            The result of the compiled graph's own stream() call, for the caller to iterate
+        """
+        # Hand the graph an empty dict rather than None when no config is given
+        opcoes = {} if config is None else config
+        return self.grafo_text_to_insight.stream(estado, opcoes)
diff --git a/src/spider/__init__.py b/src/spider/__init__.py
new file mode 100644
index 0000000..e9c82c2
--- /dev/null
+++ b/src/spider/__init__.py
@@ -0,0 +1,9 @@
+"""
+Módulo Spider: Integração com dataset Spider para avaliação de queries SQL.
+
+Submódulos:
+- data_loader: Carregar exemplos de dev.json
+- query_executor: Executar queries em bancos SQLite do spider
+- metrics: Comparar queries (similarity score, resultado exato)
+- csv_reporter: Salvar métricas em CSV por tentativa
+"""
diff --git a/src/spider/csv_reporter.py b/src/spider/csv_reporter.py
new file mode 100644
index 0000000..8ab16b0
--- /dev/null
+++ b/src/spider/csv_reporter.py
@@ -0,0 +1,149 @@
+"""
+Reporter de CSV para resultados de avaliação Spider.
+
+Fornece:
+- Inicializar CSV com header
+- Salvar linhas de tentativas
+- Gerar resumo final
+"""
+
+import csv
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+
+class CSVReporter:
+    """CSV writer that records one row per agent attempt and summarizes them."""
+
+    HEADERS = [
+        "id_exemplo",
+        "tentativa_numero",
+        "db_id",
+        "pergunta_usuario",
+        "query_ouro_spider",
+        "query_agente_tentativa",
+        "tempo_agente_ms",
+        "veredito_critico",
+        "feedback_critico_recebido",
+        "erro_execucao",
+        "resultado_exato_match",
+        "similarity_score_sql",
+    ]
+
+    def __init__(self, filepath: str | Path):
+        """
+        Create (or truncate) the CSV file and write the header row.
+
+        Args:
+            filepath: Destination CSV path; missing parent folders are created
+        """
+        self.filepath = Path(filepath)
+        self.filepath.parent.mkdir(parents=True, exist_ok=True)
+
+        # Mode "w" discards any previous content before the header goes in
+        with open(self.filepath, "w", newline="", encoding="utf-8") as saida:
+            csv.DictWriter(saida, fieldnames=self.HEADERS).writeheader()
+
+    def append_row(self, row: dict[str, Any]) -> None:
+        """
+        Append a single attempt row to the CSV file.
+
+        Args:
+            row: Dict holding every key listed in HEADERS
+
+        Raises:
+            ValueError: If any HEADERS key is absent from row
+        """
+        missing = set(self.HEADERS) - set(row)
+        if missing:
+            raise ValueError(f"Chaves obrigatórias faltando: {missing}")
+
+        # The file is opened in append mode for this one row and closed right after
+        with open(self.filepath, "a", newline="", encoding="utf-8") as saida:
+            csv.DictWriter(saida, fieldnames=self.HEADERS).writerow(row)
+
+    def generate_summary(self, rows: list[dict[str, Any]]) -> dict[str, Any]:
+        """
+        Compute aggregate statistics over the recorded attempt rows.
+
+        Rows are grouped by id_exemplo; the last row of each group decides
+        whether that question counts as approved.
+
+        Args:
+            rows: Attempt rows, in the order they were recorded
+
+        Returns:
+            Dict with totals, approval rates and averages (all zero for no rows)
+        """
+        if not rows:
+            return {
+                "total_perguntas": 0,
+                "total_tentativas": 0,
+                "perguntas_aprovadas": 0,
+                "taxa_aprovacao": 0.0,
+                "taxa_1a_tentativa": 0.0,
+                "tentativas_media": 0.0,
+                "similarity_media": 0.0,
+                "tempo_medio_ms": 0.0,
+            }
+
+        # Group the rows by question, keeping their order of arrival
+        grupos: dict[Any, list[dict[str, Any]]] = {}
+        for linha in rows:
+            grupos.setdefault(linha["id_exemplo"], []).append(linha)
+
+        total_perguntas = len(grupos)
+        total_tentativas = len(rows)
+
+        def aprovada(grupo):
+            # A question counts as approved when its last recorded attempt was
+            return grupo[-1]["veredito_critico"] == "aprovado"
+
+        def por_pergunta(valor):
+            return valor / total_perguntas if total_perguntas > 0 else 0.0
+
+        def media(valores):
+            return sum(valores) / len(valores) if valores else 0.0
+
+        perguntas_aprovadas = sum(1 for g in grupos.values() if aprovada(g))
+        perguntas_1a_tentativa = sum(1 for g in grupos.values() if len(g) == 1 and aprovada(g))
+
+        # Falsy values (empty, None, 0) are left out of both averages
+        similarities = [
+            float(t["similarity_score_sql"])
+            for g in grupos.values()
+            for t in g
+            if t["similarity_score_sql"]
+        ]
+        tempos = [
+            float(t["tempo_agente_ms"])
+            for g in grupos.values()
+            for t in g
+            if t["tempo_agente_ms"]
+        ]
+
+        return {
+            "total_perguntas": total_perguntas,
+            "total_tentativas": total_tentativas,
+            "perguntas_aprovadas": perguntas_aprovadas,
+            "taxa_aprovacao": por_pergunta(perguntas_aprovadas),
+            "taxa_1a_tentativa": por_pergunta(perguntas_1a_tentativa),
+            "tentativas_media": por_pergunta(total_tentativas),
+            "similarity_media": media(similarities),
+            "tempo_medio_ms": media(tempos),
+        }
+
+    @staticmethod
+    def generate_timestamped_filename(prefix: str = "spider_eval") -> str:
+        """
+        Build a CSV file name that embeds the current local date and time.
+
+        Args:
+            prefix: Leading part of the file name
+
+        Returns:
+            A name such as spider_eval_2025-04-06_14-30-45.csv
+        """
+        carimbo = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
+        return f"{prefix}_{carimbo}.csv"
diff --git a/src/spider/data_loader.py b/src/spider/data_loader.py
new file mode 100644
index 0000000..8c5297e
--- /dev/null
+++ b/src/spider/data_loader.py
@@ -0,0 +1,96 @@
+"""
+Carregador de dados do Spider dataset.
+
+Fornece funcionalidades para:
+- Carregar exemplos de dev.json (pergunta, query_ouro, db_id)
+- Fazer sampling reproducível com seed
+- Filtrar por banco de dados específico
+"""
+
+import json
+import random
+from pathlib import Path
+from typing import Any
+
+
+def load_spider_dev_examples(data_dir: str = "data/spider_data/spider_data") -> list[dict[str, Any]]:
+    """
+    Load the examples stored in the Spider dataset's dev.json file.
+
+    Args:
+        data_dir: Directory that contains dev.json
+
+    Returns:
+        The parsed JSON content (expected: dicts with db_id, question, query)
+
+    Raises:
+        FileNotFoundError: If dev.json does not exist under data_dir
+        json.JSONDecodeError: If the file is not valid JSON
+    """
+    dev_path = Path(data_dir) / "dev.json"
+
+    if not dev_path.exists():
+        raise FileNotFoundError(
+            f"dev.json não encontrado em {dev_path}. "
+            f"Certifique-se que está em data/spider_data/spider_data/"
+        )
+
+    # Read as UTF-8 explicitly: JSON text is UTF-8, while open() would otherwise
+    # fall back to the platform's locale encoding (not UTF-8 on every system).
+    with open(dev_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def sample_examples(
+    examples: list[dict[str, Any]],
+    sample_size: int | None = None,
+    seed: int | None = None,
+) -> list[dict[str, Any]]:
+    """
+    Draw a random sample of the examples, reproducible when a seed is given.
+
+    Args:
+        examples: List of examples
+        sample_size: How many examples to take (None = all of them)
+        seed: Passed to random.seed(), which reseeds the module-level RNG
+
+    Returns:
+        The input list itself when nothing is cut, otherwise a new sampled list
+    """
+    if seed is not None:
+        random.seed(seed)
+
+    if sample_size is None or sample_size >= len(examples):
+        return examples
+
+    return random.sample(examples, k=sample_size)
+
+
+def filter_by_db_id(
+    examples: list[dict[str, Any]],
+    db_id: str,
+) -> list[dict[str, Any]]:
+    """
+    Keep only the examples whose "db_id" equals the given database id.
+
+    Args:
+        examples: List of examples
+        db_id: Database id (e.g. concert_singer)
+
+    Returns:
+        New list with the examples of that database
+    """
+    return [ex for ex in examples if ex["db_id"] == db_id]
+
+
+def get_unique_db_ids(examples: list[dict[str, Any]]) -> list[str]:
+    """
+    Return the distinct database ids found in the examples, sorted.
+
+    Args:
+        examples: List of examples
+
+    Returns:
+        Sorted list of unique db_ids
+    """
+    return sorted(set(ex["db_id"] for ex in examples))
diff --git a/src/spider/metrics.py b/src/spider/metrics.py
new file mode 100644
index 0000000..6e82f30
--- /dev/null
+++ b/src/spider/metrics.py
@@ -0,0 +1,161 @@
+"""
+Métricas para comparação de queries SQL.
+
+Fornece:
+- Similarity score entre duas queries (difflib-based)
+- Comparação de resultados (exato match)
+- Normalização de SQL para comparação
+"""
+
+import difflib
+import re
+from typing import Any
+
+
+def normalize_sql(sql: str) -> str:
+    """
+    Normalize SQL text so that two queries can be compared more robustly.
+
+    - Collapses whitespace runs into single spaces
+    - Converts everything to upper case
+    - Strips line (--) and block (/* */) comments
+    - Strips trailing semicolons
+
+    Args:
+        sql: SQL text to normalize
+
+    Returns:
+        Normalized SQL
+    """
+    # Drop line comments (purely textual: "--" inside a string literal is cut too)
+    sql = re.sub(r"--.*$", "", sql, flags=re.MULTILINE)
+
+    # Drop block comments
+    sql = re.sub(r"/\*.*?\*/", "", sql, flags=re.DOTALL)
+
+    # Drop trailing semicolons and surrounding whitespace
+    sql = sql.rstrip("; \n\t")
+
+    # Upper-case the whole text, string literals included
+    sql = sql.upper()
+
+    # Collapse whitespace runs
+    sql = re.sub(r"\s+", " ", sql).strip()
+
+    return sql
+
+
+def sql_similarity_score(sql1: str, sql2: str) -> float:
+    """
+    Compute a similarity score between two SQL texts with SequenceMatcher.
+
+    Args:
+        sql1: First query
+        sql2: Second query
+
+    Returns:
+        Score from 0 to 1 (1 = identical after normalize_sql)
+    """
+    norm1 = normalize_sql(sql1)
+    norm2 = normalize_sql(sql2)
+
+    # Both empty: treat as identical
+    if not norm1 and not norm2:
+        return 1.0
+
+    # Only one empty: completely different
+    if not norm1 or not norm2:
+        return 0.0
+
+    matcher = difflib.SequenceMatcher(None, norm1, norm2)
+    return matcher.ratio()
+
+
+def results_exact_match(
+    results_gold: list[dict[str, Any]],
+    results_agent: list[dict[str, Any]],
+) -> bool:
+    """
+    Tell whether two result sets hold the same rows, duplicates included.
+
+    Compares:
+    - Number of rows
+    - Values of each row (row order and column order are ignored)
+
+    Args:
+        results_gold: Rows returned by the gold query
+        results_agent: Rows returned by the agent's query
+
+    Returns:
+        True when both results contain the same rows the same number of times
+    """
+    if len(results_gold) != len(results_agent):
+        return False
+
+    # Turn each row into a tuple of (column, text) pairs with sorted columns, then
+    # sort the rows: unlike a set, a sorted list keeps duplicate rows countable.
+    def canonical_rows(results: list[dict[str, Any]]) -> list:
+        converted = []
+        for row in results:
+            # Values become strings so that differing types still compare
+            items = []
+            for k in sorted(row.keys()):
+                # Normalize None/NULL
+                v = row[k]
+                if v is None:
+                    v = "NULL"
+                items.append((k, str(v)))
+            converted.append(tuple(items))
+        return sorted(converted)
+
+    return canonical_rows(results_gold) == canonical_rows(results_agent)
+
+
+def build_comparison_row(
+    id_exemplo: int,
+    tentativa_numero: int,
+    db_id: str,
+    pergunta: str,
+    query_ouro: str,
+    query_agente: str,
+    tempo_agente_ms: float,
+    veredito_critico: str,
+    feedback_critico: str,
+    erro_execucao: str,
+    resultado_exato_match: bool | None,
+    similarity_score: float,
+) -> dict[str, Any]:
+    """
+    Build one row of the evaluation CSV.
+
+    Args:
+        id_exemplo: Sequential id of the question
+        tentativa_numero: Which attempt this is (1, 2, 3...)
+        db_id: Database id
+        pergunta: Natural-language question
+        query_ouro: Reference (gold) query from Spider
+        query_agente: Query generated by the agent in THIS attempt
+        tempo_agente_ms: Elapsed time in ms (rounded to 2 decimals in the row)
+        veredito_critico: "aprovado" / "reprovado" / "erro"
+        feedback_critico: Feedback received (or "Aprovado" when approved)
+        erro_execucao: Error message (empty when OK)
+        resultado_exato_match: True/False for the result comparison (None is stored as "")
+        similarity_score: Score 0-1 (rounded to 4 decimals in the row)
+
+    Returns:
+        Dict with 12 keys for the CSV
+    """
+    return {
+        "id_exemplo": id_exemplo,
+        "tentativa_numero": tentativa_numero,
+        "db_id": db_id,
+        "pergunta_usuario": pergunta,
+        "query_ouro_spider": query_ouro,
+        "query_agente_tentativa": query_agente,
+        "tempo_agente_ms": round(tempo_agente_ms, 2),
+        "veredito_critico": veredito_critico,
+        "feedback_critico_recebido": feedback_critico,
+        "erro_execucao": erro_execucao,
+        "resultado_exato_match": resultado_exato_match if resultado_exato_match is not None else "",
+        "similarity_score_sql": round(similarity_score, 4),
+    }
diff --git a/src/spider/query_executor.py b/src/spider/query_executor.py
new file mode 100644
index 0000000..c03142c
--- /dev/null
+++ b/src/spider/query_executor.py
@@ -0,0 +1,125 @@
+"""
+Executor de queries contra bancos SQLite do Spider dataset.
+
+Fornece funcionalidades para:
+- Conectar dinamicamente a bancos por db_id
+- Executar queries em modo read-only
+- Capturar resultados e erros
+"""
+
+import sqlite3
+import time
+from pathlib import Path
+from typing import Any
+
+
+class SpiderQueryExecutor:
+    """Runs queries against Spider SQLite databases opened in read-only mode."""
+
+    def __init__(self, database_dir: str = "data/spider_data/spider_data/database"):
+        """
+        Initialize the executor.
+
+        Args:
+            database_dir: Directory holding one sub-folder per database
+        """
+        self.database_dir = Path(database_dir)
+
+    def get_db_path(self, db_id: str) -> Path:
+        """
+        Return the path of a given database file.
+
+        Args:
+            db_id: Database id (e.g. concert_singer)
+
+        Returns:
+            Path to <database_dir>/<db_id>/<db_id>.sqlite
+
+        Raises:
+            FileNotFoundError: If that file does not exist
+        """
+        db_path = self.database_dir / db_id / f"{db_id}.sqlite"
+        if not db_path.exists():
+            raise FileNotFoundError(f"Banco não encontrado: {db_path}")
+        return db_path
+
+    def execute_query(
+        self,
+        db_id: str,
+        sql: str,
+        timeout: int = 30,
+    ) -> dict[str, Any]:
+        """
+        Run a query on a database opened read-only; failures are returned, not raised.
+
+        Args:
+            db_id: Database id
+            sql: SQL text to execute
+            timeout: Seconds sqlite3 waits on a locked database (not a query time cap)
+
+        Returns:
+            Dict with keys:
+            - success: bool
+            - results: list[dict] (empty on failure)
+            - row_count: int (number of rows returned)
+            - error: str (empty on success)
+            - time_ms: float (elapsed wall-clock time)
+        """
+        start_time = time.time()
+
+        try:
+            db_path = self.get_db_path(db_id)
+
+            # Connect in read-only mode
+            connection_string = f"file:{db_path}?mode=ro&uri=true"
+            conn = sqlite3.connect(connection_string, timeout=timeout, uri=True)
+            try:
+                conn.row_factory = sqlite3.Row  # rows convertible to dicts
+                cursor = conn.cursor()
+
+                # Run the query and materialize every row
+                cursor.execute(sql)
+                rows = cursor.fetchall()
+
+                # Convert to list[dict]
+                results = [dict(row) for row in rows]
+            finally:
+                conn.close()  # always release the handle, even when the query fails
+
+            elapsed_ms = (time.time() - start_time) * 1000
+
+            return {
+                "success": True,
+                "results": results,
+                "row_count": len(results),
+                "error": "",
+                "time_ms": elapsed_ms,
+            }
+
+        except sqlite3.Error as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            return {
+                "success": False,
+                "results": [],
+                "row_count": 0,
+                "error": f"SQLite error: {str(e)}",
+                "time_ms": elapsed_ms,
+            }
+        except FileNotFoundError as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            return {
+                "success": False,
+                "results": [],
+                "row_count": 0,
+                "error": f"Database not found: {str(e)}",
+                "time_ms": elapsed_ms,
+            }
+        except Exception as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            return {
+                "success": False,
+                "results": [],
+                "row_count": 0,
+                "error": f"Unexpected error: {str(e)}",
+                "time_ms": elapsed_ms,
+            }

From bef2d366a1e12b53dbbcb3f8572a716a36b5e0bf Mon Sep 17 00:00:00 2001
From: JonasMelo21 <jonashonorato4@gmail.com>
Date: Sat, 11 Apr 2026 08:54:59 -0300
Subject: [PATCH 02/10] docs: move Spider benchmark doc to
 src/spider/BENCHMARK.md

---
 DESENVOLVIMENTO.md                          | 157 ++----------------
 README.md                                   |  75 +--------
 reports/spider_eval_2026-04-11_08-44-42.csv |   2 +
 src/spider/BENCHMARK.md                     | 173 ++++++++++++++++++++
 4 files changed, 201 insertions(+), 206 deletions(-)
 create mode 100644 reports/spider_eval_2026-04-11_08-44-42.csv
 create mode 100644 src/spider/BENCHMARK.md

diff --git a/DESENVOLVIMENTO.md b/DESENVOLVIMENTO.md
index 17e5700..5021b90 100644
--- a/DESENVOLVIMENTO.md
+++ b/DESENVOLVIMENTO.md
@@ -75,128 +75,15 @@ Se um teste falha:
 - Falha na camada 2 → o nó específico que falhou está com problema
 - Camada 2 passa mas camada 3 falha → problema nos roteadores ou na conexão entre nós
 
-## Testes do módulo Spider
+## Módulo de avaliação com Spider Dataset
 
-```bash
-# Testar data loader
-python -c "from src.spider.data_loader import load_spider_dev_examples; ex = load_spider_dev_examples('data/spider_data/spider_data'); print(f'Carregado: {len(ex)} exemplos')"
-
-# Testar query executor
-python -c "from src.spider.query_executor import SpiderQueryExecutor; ex = SpiderQueryExecutor(); print(f'DBs encontrados: {len(ex.list_available_dbs())}')"
-
-# Testar metrics
-python -c "from src.spider.metrics import sql_similarity_score; print(f'Score: {sql_similarity_score(\\\"SELECT * FROM a\\\", \\\"select * from a\\\")}')"
-```
-
-## Arquitetura do módulo Spider
-
-O módulo reutiliza o grafo existente para avaliar cada pergunta do Spider dataset:
-
-### Fluxo por pergunta
-
-```
-1. Load: data_loader carrega dev.json (1.034 exemplos)
-   └─> Cada exemplo tem: {question, query (ouro), db_id}
-
-2. Query Ouro: SpiderQueryExecutor executa query original no banco
-   └─> Captura resultado esperado (baseline)
-
-3. Para cada tentativa (até 3x):
-   a) Estado Inicial: pergunta + schema do banco
-   b) Grafo Executa: planejador → schema → code_agent → sandbox → crítico
-   c) Stream Acumula: full_estado coleta mudanças de cada nó
-   d) Crítico Decide:
-      - Se aprovado: extrai query_agente, compara com ouro
-      - Se reprovado: volta ao planejador (retry)
-   e) CSV Registra: 1 linha por tentativa com todos os dados
-
-4. Resumo: CSVReporter calcula estatísticas (taxa aprovação, similarity média, etc)
-```
-
-### Pontos técnicos importantes
-
-- **State Accumulation**: O stream() retorna deltas por nó, não estado total. Script mantém `full_estado = estado_inicial.copy()` e atualiza com cada output
-- **Recursion Limit**: Config de `recursion_limit=30` permite planejador iterar até 3 vezes sem erro
-- **Database Path**: Spider stores databases em `data/spider_data/spider_data/database/{db_id}/{db_id}.sqlite`
-- **Read-only Mode**: SQLite conecta com `?mode=ro&uri=true` para evitar escrita
-- **Normalization**: SQL é normalizado (UPPER, sem comentários, sem whitespace) antes de comparar similarity
-
-### Customizando a avaliação
-
-Para testar com diferentes LLMs ou ajustar prompts:
-
-1. Editar `src/nodes/planner.py`, `code_agent.py`, `critic.py` conforme necessário
-2. Script USA o mesmo grafo (`src/graph.py`), então mudanças se refletem automaticamente
-3. Para testar especificamente um nó: reutilize o teste em `tests/test_nodes.py`
-
-## Avaliação com Spider Dataset
-
-O projeto inclui um sistema completo de avaliação contra o dataset Spider, que rastreia cada tentativa do agente quando o crítico reprova.
-
-### Setup
-
-```bash
-# Baixar o dataset Spider
-# Extrair em data/spider_data/
-# Estrutura esperada:
-# data/spider_data/spider_data/
-#   ├── dev.json
-#   ├── database/
-#   │   ├── concert_singer/
-#   │   ├── pets_1/
-#   │   └── ...
-```
-
-### Executar avaliação
-
-```bash
-# Teste simples com 10 perguntas
-python scripts/test_spider_eval.py
-
-# Com customizações
-python scripts/test_spider_eval.py \
-  --sample-size 50 \
-  --seed 42 \
-  --db-filter concert_singer \
-  --output reports/spider_eval.csv \
-  --max-attempts 5
-
-# Parâmetros:
-# --sample-size N       : Quantas perguntas testar (default: 10)
-# --seed SEED           : Seed para reproducibilidade (default: 42)
-# --db-filter DB_ID     : Filtrar por um banco (ex: concert_singer)
-# --output PATH         : Caminho CSV (default: reports/spider_eval_TIMESTAMP.csv)
-# --max-attempts N      : Máx tentativas por pergunta (default: 3)
-```
-
-### Entender os resultados
-
-O script gera um CSV com **uma linha por tentativa**:
-
-| Coluna | Significado |
-|--------|-------------|
-| tentativa_numero | 1ª, 2ª, 3ª tentativa desta pergunta |
-| veredito_critico | aprovado/reprovado/erro |
-| feedback_critico | Motivo da reprovação (para debug) |
-| similarity_score_sql | 0-1, similaridade com query ouro |
-| resultado_exato_match | True se resultado foi idêntico |
-| tempo_agente_ms | Quanto tempo levou aquela tentativa |
-
-Resumo final:
-```
-Total de perguntas: 100
-Perguntas aprovadas: 82
-Taxa de aprovação: 82.0%
-Taxa de sucesso na 1ª tentativa: 75.0%
-Tentativas médias por pergunta: 1.23
-Similarity score médio: 0.945
-Tempo médio por tentativa: 12500 ms
-```
+**Documentação completa em: [`src/spider/BENCHMARK.md`](src/spider/BENCHMARK.md)**
 
-**Análise**:
-- Taxa de 1ª tentativa baixa? → Agente gerando queries incorretas inicialmente
-- Similarity alto mas veredito reprovado? → Queries diferentes semanticamente
-- Muitas tentativas? → Crítico ou agente não aprendendo com feedback
+- O módulo reutiliza o grafo existente para avaliar perguntas contra o Spider Dataset
+- Rastreia cada tentativa (quando crítico reprova, volta ao planejador)
+- Componentes: data_loader, query_executor, metrics, csv_reporter
+- Orquestrador: `scripts/test_spider_eval.py`
+- Uso: `python scripts/test_spider_eval.py --sample-size 50 --db-filter concert_singer`
 
 ## Estrutura de arquivos
 
@@ -214,29 +101,19 @@ TextToInsight/
 ├── src/
 │   ├── state.py                   # EstadoTextToInsight (TypedDict)
 │   ├── graph.py                   # Grafo LangGraph
-│   ├── nodes/
-│   │   ├── planner.py             # Planejador (Gemini)
-│   │   ├── schema.py              # Extração de schema (SQLite)
-│   │   ├── code_agent/
-│   │   │   ├── code_agent.py      # Geração SQL (Gemini)
-│   │   │   └── code_sql.py        # Validação + execução SQL
-│   │   ├── sandbox.py             # Executor SQL (banco real)
-│   │   └── critic.py              # Avaliador (Gemini)
-│   ├── spider/                    # **Módulo de Avaliação Spider**
-│   │   ├── data_loader.py         # Carregar dev.json
-│   │   ├── query_executor.py      # Executar queries
-│   │   ├── metrics.py             # Similarity score, comparações
-│   │   └── csv_reporter.py        # Gerar CSV por tentativa
+│   ├── nodes/                     # Nós do grafo
+│   │   ├── planner.py, schema.py, code_agent/, sandbox.py, critic.py
+│   ├── spider/                    # Módulo de benchmark Spider
+│   │   ├── BENCHMARK.md           # 📖 Documentação técnica
+│   │   ├── data_loader.py         # Carrega dev.json
+│   │   ├── query_executor.py      # Executa queries
+│   │   ├── metrics.py             # Similarity e matching
+│   │   └── csv_reporter.py        # Gera CSV e resumo
 │   └── routers/
-│       └── edges.py               # Roteadores condicionais
 ├── scripts/
-│   └── test_spider_eval.py        # Script de avaliação Spider
-├── reports/
-│   └── spider_eval_*.csv          # Resultados das avaliações
+│   └── test_spider_eval.py        # Orquestrador do benchmark
+├── reports/                       # CSVs de avaliação
 └── tests/
-    ├── test_componentes.py        # Sem API
-    ├── test_nodes.py              # Com API, nó a nó
-    └── test_integracao.py         # Com API, grafo completo
 ```
 
 ## Criando um novo nó
diff --git a/README.md b/README.md
index 7de6682..ce81234 100644
--- a/README.md
+++ b/README.md
@@ -66,80 +66,23 @@ pytest tests/test_integracao.py -v -s
 
 ## Avaliação com Spider Dataset
 
-Sistema completo de avaliação que testa o agente contra o dataset Spider, rastreando cada tentativa (quando o crítico reprova e volta ao planejador).
+O projeto inclui um módulo de avaliação (`src/spider/`) que testa o agente contra o **Spider Dataset** (1.034 perguntas reais em SQL, 20 bancos diferentes), rastreando cada tentativa quando o crítico reprova.
 
-**Nota**: Spider Dataset é opcional. Use apenas se quiser avaliar contra 1.034 exemplos reais.
+**Para detalhes técnicos sobre o módulo, arquitetura e fluxo de debug, consulte: [`src/spider/BENCHMARK.md`](src/spider/BENCHMARK.md)**
 
-### Setup
+**Nota**: Spider Dataset é opcional.
 
-1. [Baixar Spider Dataset](https://drive.google.com/uc?export=download&id=1iYkIGr7MwuOvBkRkj4RKs6Ff3NMa7NMG)
-2. Descompactar em `data/spider_data/`
-3. Verificar estrutura:
-   ```
-   data/spider_data/spider_data/
-   ├── dev.json
-   ├── database/
-   │   ├── concert_singer/
-   │   ├── pets_1/
-   │   └── ... (20 bancos no total)
-   ```
-
-### Como usar
+### Setup rápido
 
 ```bash
-# Teste simples com 10 perguntas (padrão)
-python scripts/test_spider_eval.py
-
-# Com customizações
-python scripts/test_spider_eval.py \
-  --sample-size 50 \
-  --seed 42 \
-  --db-filter concert_singer \
-  --output reports/my_eval.csv
-
-# Parâmetros
-# --sample-size N       : Quantas perguntas testar (default: 10)
-# --seed SEED           : Seed para reproducibilidade (default: 42)
-# --db-filter DB_ID     : Filtrar por um banco específico
-# --output PATH         : Caminho para salvar CSV (default: reports/spider_eval_TIMESTAMP.csv)
-# --max-attempts N      : Máximo de tentativas por pergunta (default: 3)
-```
-
-### Saída
+# Avaliação rápida com 10 perguntas (requer o dataset Spider em data/spider_data/)
+python scripts/test_spider_eval.py --sample-size 10
 
-O script gera um CSV com 12 colunas, **uma linha por tentativa**:
-
-```
-id_exemplo | tentativa_numero | db_id | pergunta_usuario | query_ouro_spider | query_agente_tentativa | veredito_critico | feedback_critico_recebido | similarity_score_sql | resultado_exato_match | ...
-1          | 1                | concert_singer | How many singers? | SELECT ... | SELECT ... | reprovado | Table "singers" does not... | 0.85   | -
-1          | 2                | concert_singer | How many singers? | SELECT ... | SELECT ... | aprovado  | Aprovado | 1.00 | True
+# Com banco de dados específico
+python scripts/test_spider_eval.py --db-filter concert_singer --sample-size 50
 ```
 
-E um resumo final:
-
-```
-Total de perguntas: 10
-Total de tentativas: 12
-Perguntas aprovadas: 8
-Taxa de aprovação: 80%
-Taxa de sucesso na 1ª tentativa: 60%
-Tentativas médias por pergunta: 1.2
-Similarity score médio: 0.92
-```
-
-### Interpretação
-
-- **id_exemplo**: ID da pergunta (mesmo para todas tentativas dela)
-- **tentativa_numero**: 1ª, 2ª, 3ª tentativa...
-- **veredito_critico**: Aprovado/Reprovado/Erro naquela tentativa
-- **feedback_critico**: Motivo da reprovação (útil para debug)
-- **similarity_score_sql**: 0-1, quanto a query do agente se parece com a ouro
-- **resultado_exato_match**: Se o resultado executado foi exatamente igual
-
-**Análise típica**:
-- Taxa de 1ª tentativa baixa? → Agente está gerando queries incorretas inicialmente
-- Similarity score alto mas veredito reprovado? → Queries sintaticamente parecidas mas semanticamente diferentes
-- Muitas tentativas? → Crítico não está dando feedback útil ou agente não aprende com feedback
+**Parâmetros completos e exemplos em [`src/spider/BENCHMARK.md`](src/spider/BENCHMARK.md)**
 
 ## Estrutura
 
diff --git a/reports/spider_eval_2026-04-11_08-44-42.csv b/reports/spider_eval_2026-04-11_08-44-42.csv
new file mode 100644
index 0000000..3267386
--- /dev/null
+++ b/reports/spider_eval_2026-04-11_08-44-42.csv
@@ -0,0 +1,2 @@
+id_exemplo,tentativa_numero,db_id,pergunta_usuario,query_ouro_spider,query_agente_tentativa,tempo_agente_ms,veredito_critico,feedback_critico_recebido,erro_execucao,resultado_exato_match,similarity_score_sql
+1,1,flight_2,Give the code of the airport with the least flights.,SELECT T1.AirportCode FROM AIRPORTS AS T1 JOIN FLIGHTS AS T2 ON T1.AirportCode  =  T2.DestAirport OR T1.AirportCode  =  T2.SourceAirport GROUP BY T1.AirportCode ORDER BY count(*) LIMIT 1,SELECT AirportCode FROM (SELECT SourceAirport AS AirportCode FROM flights UNION ALL SELECT DestAirport AS AirportCode FROM flights) AS all_airports GROUP BY AirportCode ORDER BY COUNT(*) ASC LIMIT 1,11680.65,aprovado,"A consulta SQL responde corretamente à pergunta do usuário ao combinar aeroportos de origem e destino, agrupá-los e contar o número total de voos para cada um. O uso de `UNION ALL` é apropriado para garantir que todos os voos sejam contados, e o `ORDER BY COUNT(*) ASC LIMIT 1` identifica o aeroporto com o menor número de voos. Os resultados fazem sentido e não há erros lógicos.",,False,0.4684
diff --git a/src/spider/BENCHMARK.md b/src/spider/BENCHMARK.md
new file mode 100644
index 0000000..6bcfbc9
--- /dev/null
+++ b/src/spider/BENCHMARK.md
@@ -0,0 +1,173 @@
+# Spider Benchmark - Documentação Técnica
+
+## Visão Geral
+
+Módulo de avaliação automatizada do agente Text-to-Insight contra o **Spider Dataset** (1.034 perguntas em SQL, 20 bancos diferentes). Rastreia cada tentativa individualmente gerando métricas de qualidade.
+
+## Arquitetura
+
+```
+scripts/test_spider_eval.py (Orquestrador)
+    ├── data_loader.py       (Carrega dev.json)
+    ├── query_executor.py    (Executa SQL)
+    ├── metrics.py           (Calcula similarity/match)
+    └── csv_reporter.py      (Salva results e resumo)
+```
+
+## Módulos
+
+### `src/spider/data_loader.py`
+Gerencia dataset Spider (1.034 exemplos JSON).
+
+**Funções principais:**
+- `load_spider_dev_examples(data_dir)` → Lê dev.json, retorna lista de dicts {question, query, db_id}
+- `sample_examples(examples, sample_size, seed)` → Amostra reproducível com seed
+- `filter_by_db_id(examples, db_id)` → Filtra as perguntas de um único banco (ex: concert_singer)
+- `get_unique_db_ids(examples)` → Retorna 20 db_ids únicos
+
+---
+
+### `src/spider/query_executor.py`
+Executa queries SQL contra bancos SQLite do Spider.
+
+**Classe: `SpiderQueryExecutor`**
+- `execute_query(db_id, sql)` → Executa query, retorna {success, results, row_count, error, time_ms}
+- `get_db_path(db_id)` → Resolve caminho `data/spider_data/spider_data/database/{db_id}/{db_id}.sqlite`
+- Usa SQLite em modo **read-only** (`?mode=ro&uri=true`)
+
+**Por que isolado:** Abstrai detalhes de banco de dados, facilita testar com outro driver se necessário.
+
+---
+
+### `src/spider/metrics.py`
+Compara queries geradas vs. queries ouro (baseline).
+
+**Funções principais:**
+- `sql_similarity_score(sql1, sql2)` → Valor 0-1 usando difflib.SequenceMatcher (normaliza UPPER/whitespace/comments)
+- `results_exact_match(results_gold, results_agent)` → bool, compara as linhas retornadas convertendo valores para string (None → "NULL")
+- `normalize_sql(sql)` → Transforma para comparação (rm whitespace, comments, UPPER)
+- `build_comparison_row(id_exemplo, tentativa_numero, ...)` → Monta dict com 12 colunas para CSV
+
+**Por que isolado:** Reutilizável em testes/análises extras, lógica de comparação centralizada.
+
+---
+
+### `src/spider/csv_reporter.py`
+Gerencia saída CSV e estatísticas agregadas.
+
+**Classe: `CSVReporter`**
+- `__init__(filepath)` → Cria CSV com 12 headers (id_exemplo, tentativa_numero, db_id, pergunta_usuario, query_ouro_spider, query_agente_tentativa, tempo_agente_ms, veredito_critico, feedback_critico_recebido, erro_execucao, resultado_exato_match, similarity_score_sql)
+- `append_row(row_dict)` → Adiciona 1 linha por tentativa
+- `generate_summary(rows)` → Retorna dict {total_perguntas, taxa_aprovacao, similarity_media, tentativas_media, ...}
+- `generate_timestamped_filename(prefix)` → Returns `spider_eval_2026-04-11_15-30-42.csv`
+
+**Por que isolado:** Padrão CSV fixo, resumo automático, reutilizável em análises.
+
+---
+
+### `scripts/test_spider_eval.py`
+**O maestro do pipeline.** Orquestra todo o benchmark.
+
+**Fluxo:**
+1. **Parse args** → sample-size, seed, db-filter, output, max-attempts, data-dir
+2. **Load dados** → data_loader carrega e filtra exemplos
+3. **Para cada pergunta:**
+   - Executar query ouro (baseline via query_executor)
+   - Invocar grafo LangGraph com estado inicial
+   - **Rastrear stream()** do grafo acumulando estado (`full_estado.update()`)
+   - Quando nó crítico retorna:
+     - Extrair sql_gerada, veredito, feedback do full_estado
+     - Executar query_agente, calcular similarity/match (via metrics)
+     - Salvar linha no CSV (via csv_reporter)
+     - Se aprovado: next pergunta; se reprovado & tentativas < max: retry automático
+4. **Gerar resumo** → reporter calcula estatísticas finais
+
+**Responsabilidade única:** Não calcula metrics, não executa SQL, não salva CSV. Coordena os módulos.
+
+**Configuração do grafo:**
+```python
+grafo.stream(estado_inicial, config={"recursion_limit": 30})
+```
+- `recursion_limit=30`: Permite planejador iterar até ~3 vezes sem erro de recursão
+
+**State accumulation pattern:**
+```python
+full_estado = estado_inicial.copy()
+for output in grafo.stream(...):
+    for node_name, mudancas in output.items():
+        full_estado.update(mudancas)  # Acumula deltas em estado completo
+```
+Necessário porque `stream()` retorna deltas por nó, não estado total.
+
+---
+
+## Uso
+
+```bash
+# Teste básico: 10 perguntas
+python scripts/test_spider_eval.py
+
+# Parametrizado
+python scripts/test_spider_eval.py \
+  --sample-size 50 \
+  --seed 42 \
+  --db-filter concert_singer \
+  --output reports/eval.csv \
+  --max-attempts 3
+```
+
+## Saída
+
+**CSV:**
+- 1 linha = 1 tentativa (mesma pergunta pode ter 1-3 linhas)
+- 12 colunas: id_exemplo, tentativa_numero, db_id, pergunta_usuario, query_ouro_spider, query_agente_tentativa, tempo_agente_ms, veredito_critico, feedback_critico_recebido, erro_execucao, resultado_exato_match, similarity_score_sql
+
+**Resumo:**
+- Total de perguntas avaliadas
+- Taxa de aprovação (% respostas corretas)
+- Taxa de sucesso 1ª tentativa (agente acerta de primeira?)
+- Tentativas médias por pergunta
+- Similarity score médio
+- Tempo médio por tentativa
+
+## Exemplo de Output
+
+```
+Total de perguntas: 50
+Total de tentativas: 63
+Perguntas aprovadas: 45
+Taxa de aprovação: 90.0%
+Taxa de sucesso na 1ª tentativa: 72.0%
+Tentativas médias: 1.26
+Similarity score médio: 0.953
+Tempo médio: 12345 ms
+✅ CSV salvo em: reports/spider_eval_2026-04-11_15-30-42.csv
+```
+
+## Estrutura do Estado (EstadoTextToInsight)
+
+Usado por todos os módulos, definido em `src/state.py`:
+```python
+{
+    "pergunta_usuario": str,
+    "db_path": str,
+    "contexto_schema": str,
+    "sql_gerada": str,
+    "linhas_resultado_preview": list,
+    "total_linhas_resultado": int,
+    "erro_execucao": str,
+    "feedback_critico": str,
+    "status": str,  # "aprovado", "reprovado", "erro"
+    "tentativas_loop": int,
+}
+```
+
+## Fluxo de Debug
+
+| Erro | Provável causa | Debug |
+|------|----------------|-------|
+| `FileNotFoundError: dev.json` | Dataset não baixado | `wget` Spider dataset em data/spider_data/spider_data/ |
+| `sqlite3.OperationalError: database is locked` | Mode não read-only | Verificar `query_executor.py`, deve ter `?mode=ro&uri=true` |
+| `RecursionLimitError: Recursion limit of 30` | Grafo entrando em loop infinito | Aumentar `recursion_limit` ou debugar nó que não para |
+| CSV vazio | Estado não acumulando | Verificar `full_estado.update()` no script (state accumulation pattern) |
+

From 5445300168bb0938f2764fa7325c97a77f040a05 Mon Sep 17 00:00:00 2001
From: Caua Sathler <cauasathlerufmg@gmail.com>
Date: Tue, 14 Apr 2026 19:59:50 -0300
Subject: [PATCH 03/10] =?UTF-8?q?Modifica=C3=A7=C3=B5es=20para=20testar=20?=
 =?UTF-8?q?queries=20individuais=20e=20imprimir=20resultados=20das=20queri?=
 =?UTF-8?q?es?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/test_spider_eval.py | 40 +++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/scripts/test_spider_eval.py b/scripts/test_spider_eval.py
index 365f407..5f27916 100644
--- a/scripts/test_spider_eval.py
+++ b/scripts/test_spider_eval.py
@@ -80,6 +80,12 @@ def main():
         help="Diretório com dados do Spider",
     )
 
+    parser.add_argument(
+        "--question-filter",
+        type=str,
+        help="Filtrar por um trecho específico da pergunta em inglês",
+    )
+
     args = parser.parse_args()
 
     # Validar API key
@@ -91,22 +97,30 @@ def main():
     # 1. Carregar dados
     print(f"\n📂 Carregando exemplos do Spider de {args.data_dir}...")
     try:
-        ejemplos = load_spider_dev_examples(args.data_dir)
-        print(f"✓ Carregados {len(ejemplos)} exemplos")
+        exemplos = load_spider_dev_examples(args.data_dir)
+        print(f"✓ Carregados {len(exemplos)} exemplos")
     except FileNotFoundError as e:
         print(f"❌ {e}")
         sys.exit(1)
 
     # 2. Aplicar filtros
     if args.db_filter:
-        ejemplos = filter_by_db_id(ejemplos, args.db_filter)
-        print(f"✓ Filtrados por db_id={args.db_filter}: {len(ejemplos)} exemplos")
-
+        exemplos = filter_by_db_id(exemplos, args.db_filter)
+        print(f"✓ Filtrados por db_id={args.db_filter}: {len(exemplos)} exemplos")
+
+    # --- NOVO TRECHO ADICIONADO ---
+    if args.question_filter:
+        exemplos = [
+            ex for ex in exemplos 
+            if args.question_filter.lower() in ex.get("question", "").lower()
+        ]
+        print(f"✓ Filtrados pela pergunta contendo '{args.question_filter}': {len(exemplos)} exemplos")
+    
     # 3. Fazer sampling
-    ejemplos = sample_examples(ejemplos, sample_size=args.sample_size, seed=args.seed)
+    exemplos = sample_examples(exemplos, sample_size=args.sample_size, seed=args.seed)
     print(
-        f"✓ Selecionados {len(ejemplos)} exemplos (seed={args.seed}, "
-        f"bancos únicos: {len(get_unique_db_ids(ejemplos))})"
+        f"✓ Selecionados {len(exemplos)} exemplos (seed={args.seed}, "
+        f"bancos únicos: {len(get_unique_db_ids(exemplos))})"
     )
 
     # 4. Inicializar componentes
@@ -131,18 +145,18 @@ def main():
     print(f"✓ CSV reporter inicializado: {csv_path}")
 
     # 6. Loop de testes
-    print(f"\n🚀 Iniciando avaliação com {len(ejemplos)} perguntas...\n")
+    print(f"\n🚀 Iniciando avaliação com {len(exemplos)} perguntas...\n")
     print("=" * 100)
 
     all_rows = []
     ex_id = 1
 
-    for idx, ex in enumerate(ejemplos, 1):
+    for idx, ex in enumerate(exemplos, 1):
         pergunta = ex.get("question", "")
         query_ouro = ex.get("query", "")
         db_id = ex.get("db_id", "")
 
-        print(f"\n[{idx}/{len(ejemplos)}] Pergunta: {pergunta[:60]}...")
+        print(f"\n[{idx}/{len(exemplos)}] Pergunta: {pergunta[:60]}...")
         print(f"     DB: {db_id} | Query Ouro: {query_ouro[:50]}...")
 
         # Executar query ouro para obter resultado esperado
@@ -214,6 +228,10 @@ def main():
                         if query_agente and not erro_exec:
                             resultado_agente = executor.execute_query(db_id, query_agente)
                             if resultado_agente["success"]:
+                                # Testes de pensamentos pensantes
+                                print(f"Resultado Ouro: {resultado_ouro["results"][:50]}")
+                                print(f"Resultado Text-to-Insight: {resultado_agente["results"][:50]}")
+
                                 resultado_exato_match = results_exact_match(
                                     resultado_ouro["results"],
                                     resultado_agente["results"],

From e09c667f016c10596460b271297f03a1002bc6d3 Mon Sep 17 00:00:00 2001
From: Caua Sathler <cauasathlerufmg@gmail.com>
Date: Tue, 14 Apr 2026 20:00:24 -0300
Subject: [PATCH 04/10] =?UTF-8?q?Modifica=C3=A7=C3=B5es=20em=20coment?=
 =?UTF-8?q?=C3=A1rios=20e=20no=20gitignore?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                  | 5 ++++-
 scripts/test_spider_eval.py | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3f5196b..e4f0843 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,4 +142,7 @@ config_local.py
 data/
 
 # testes
-/meus_testes/*
\ No newline at end of file
+/meus_testes/*
+
+# reports testes spider
+/reports/
\ No newline at end of file
diff --git a/scripts/test_spider_eval.py b/scripts/test_spider_eval.py
index 5f27916..6b93519 100644
--- a/scripts/test_spider_eval.py
+++ b/scripts/test_spider_eval.py
@@ -80,6 +80,7 @@ def main():
         help="Diretório com dados do Spider",
     )
 
+    # testar queries individualmente
     parser.add_argument(
         "--question-filter",
         type=str,
@@ -228,7 +229,7 @@ def main():
                         if query_agente and not erro_exec:
                             resultado_agente = executor.execute_query(db_id, query_agente)
                             if resultado_agente["success"]:
-                                # Testes de pensamentos pensantes
+                                # Imprimir os resultados das duas queries
                                 print(f"Resultado Ouro: {resultado_ouro["results"][:50]}")
                                 print(f"Resultado Text-to-Insight: {resultado_agente["results"][:50]}")
 

From 79fab43403904eac1dc1755bd07f5509736d0e20 Mon Sep 17 00:00:00 2001
From: Caua Sathler <cauasathlerufmg@gmail.com>
Date: Wed, 15 Apr 2026 10:23:23 -0300
Subject: [PATCH 05/10] =?UTF-8?q?Merge=20com=20a=20HITL=20e=20implementa?=
 =?UTF-8?q?=C3=A7=C3=A3o=20de=20compara=C3=A7=C3=A3o=20de=20resultados=20c?=
 =?UTF-8?q?om=20base=20em=20arrays=20np?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 reports/spider_eval_2026-04-11_08-44-42.csv |  2 -
 scripts/test_spider_eval.py                 | 15 +++-
 src/spider/metrics.py                       | 98 +++++++++++++++------
 3 files changed, 81 insertions(+), 34 deletions(-)
 delete mode 100644 reports/spider_eval_2026-04-11_08-44-42.csv

diff --git a/reports/spider_eval_2026-04-11_08-44-42.csv b/reports/spider_eval_2026-04-11_08-44-42.csv
deleted file mode 100644
index 3267386..0000000
--- a/reports/spider_eval_2026-04-11_08-44-42.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-id_exemplo,tentativa_numero,db_id,pergunta_usuario,query_ouro_spider,query_agente_tentativa,tempo_agente_ms,veredito_critico,feedback_critico_recebido,erro_execucao,resultado_exato_match,similarity_score_sql
-1,1,flight_2,Give the code of the airport with the least flights.,SELECT T1.AirportCode FROM AIRPORTS AS T1 JOIN FLIGHTS AS T2 ON T1.AirportCode  =  T2.DestAirport OR T1.AirportCode  =  T2.SourceAirport GROUP BY T1.AirportCode ORDER BY count(*) LIMIT 1,SELECT AirportCode FROM (SELECT SourceAirport AS AirportCode FROM flights UNION ALL SELECT DestAirport AS AirportCode FROM flights) AS all_airports GROUP BY AirportCode ORDER BY COUNT(*) ASC LIMIT 1,11680.65,aprovado,"A consulta SQL responde corretamente à pergunta do usuário ao combinar aeroportos de origem e destino, agrupá-los e contar o número total de voos para cada um. O uso de `UNION ALL` é apropriado para garantir que todos os voos sejam contados, e o `ORDER BY COUNT(*) ASC LIMIT 1` identifica o aeroporto com o menor número de voos. Os resultados fazem sentido e não há erros lógicos.",,False,0.4684
diff --git a/scripts/test_spider_eval.py b/scripts/test_spider_eval.py
index 6b93519..6fa20fa 100644
--- a/scripts/test_spider_eval.py
+++ b/scripts/test_spider_eval.py
@@ -90,9 +90,12 @@ def main():
     args = parser.parse_args()
 
     # Validar API key
-    api_key = os.getenv("GOOGLE_API_KEY")
+    # model = "gpt-4o-mini"
+    model = "gemini-2.5-flash"
+    
+    api_key = os.getenv("OPENAI_API_KEY") if "gpt" in model.lower() else os.getenv("GOOGLE_API_KEY")
     if not api_key:
-        print("❌ Erro: GOOGLE_API_KEY não encontrada em .env")
+        print("❌ Erro: Chave API não encontrada em .env")
         sys.exit(1)
 
     # 1. Carregar dados
@@ -127,7 +130,7 @@ def main():
     # 4. Inicializar componentes
     print("\n🔧 Inicializando componentes...")
     try:
-        grafo = Graph(api_key)
+        grafo = Graph(model=model, api_key=api_key)
         print("✓ Grafo LangGraph inicializado")
     except Exception as e:
         print(f"❌ Erro ao inicializar grafo: {e}")
@@ -192,7 +195,11 @@ def main():
 
         try:
             for output in grafo.stream(
-                estado_inicial, config={"recursion_limit": 30}
+                estado_inicial, config={"recursion_limit": 30,
+                                        "configurable": {
+                                            "thread_id": f"spider_test_{ex_id}", # config incluindo a thread de memória 
+                                        }
+                                    }
             ):
                 # stream() retorna dict: {'nó_name': {mudanças_do_nó}}
                 # Acumular mudanças no estado completo
diff --git a/src/spider/metrics.py b/src/spider/metrics.py
index 6e82f30..47c7b10 100644
--- a/src/spider/metrics.py
+++ b/src/spider/metrics.py
@@ -10,6 +10,7 @@
 import difflib
 import re
 from typing import Any
+import numpy as np
 
 
 def normalize_sql(sql: str) -> str:
@@ -71,44 +72,85 @@ def sql_similarity_score(sql1: str, sql2: str) -> float:
     return matcher.ratio()
 
 
+# def results_exact_match(
+#     results_gold: list[dict[str, Any]],
+#     results_agent: list[dict[str, Any]],
+# ) -> bool:
+#     """
+#     Compara se dois conjuntos de resultados são exatamente iguais.
+
+#     Compara:
+#     - Número de linhas
+#     - Valores de cada linha (insensível a ordem das colunas)
+
+#     Args:
+#         results_gold: Resultados da query ouro
+#         results_agent: Resultados da query do agente
+
+#     Returns:
+#         True se resultados são iguais
+#     """
+#     if len(results_gold) != len(results_agent):
+#         return False
+
+#     # Converter dicts para conjuntos de tuplas para comparação
+#     # (para serem agnósticos à ordem das colunas)
+#     def result_set(results: list[dict[str, Any]]) -> set:
+#         converted = []
+#         for row in results:
+#             # Converter valores para strings para lidar com tipos diferentes
+#             items = []
+#             for k in sorted(row.keys()):
+#                 # Normalizar None/NULL
+#                 v = row[k]
+#                 if v is None:
+#                     v = "NULL"
+#                 items.append((k, str(v)))
+#             converted.append(tuple(items))
+#         return set(converted)
+
+#     return result_set(results_gold) == result_set(results_agent)
+
 def results_exact_match(
     results_gold: list[dict[str, Any]],
     results_agent: list[dict[str, Any]],
 ) -> bool:
     """
-    Compara se dois conjuntos de resultados são exatamente iguais.
-
-    Compara:
-    - Número de linhas
-    - Valores de cada linha (insensível a ordem das colunas)
-
-    Args:
-        results_gold: Resultados da query ouro
-        results_agent: Resultados da query do agente
-
-    Returns:
-        True se resultados são iguais
+    Return True when both result sets hold the same values, comparing values ONLY.
+    Column names and row order are ignored; the order of values inside a row is not.
     """
+    # Different row counts can never match.
     if len(results_gold) != len(results_agent):
         return False
+        
+    # Both result sets are empty (0 rows): they match.
+    if not results_gold:
+        return True
 
-    # Converter dicts para conjuntos de tuplas para comparação
-    # (para serem agnósticos à ordem das colunas)
-    def result_set(results: list[dict[str, Any]]) -> set:
-        converted = []
+    def extract_values_to_numpy(results: list[dict[str, Any]]) -> np.ndarray:
+        matrix = []
         for row in results:
-            # Converter valores para strings para lidar com tipos diferentes
-            items = []
-            for k in sorted(row.keys()):
-                # Normalizar None/NULL
-                v = row[k]
-                if v is None:
-                    v = "NULL"
-                items.append((k, str(v)))
-            converted.append(tuple(items))
-        return set(converted)
-
-    return result_set(results_gold) == result_set(results_agent)
+            # Keep ONLY the values; the keys (column names) are dropped.
+            # Everything becomes a string (None -> "NULL"); note str(0) != str(0.0), so 0 and 0.0 still differ.
+            row_values = [str(v) if v is not None else "NULL" for v in row.values()]
+            matrix.append(row_values)
+            
+        # Turn the nested Python list into a NumPy array.
+        arr = np.array(matrix)
+        
+        # The queries may return rows in different orders (when there is no ORDER BY),
+        # so the rows are sorted lexicographically to make the comparison fair.
+        # np.lexsort treats its LAST key as the primary one, hence the transposed, reversed columns.
+        sorted_indices = np.lexsort(arr.T[::-1])
+        return arr[sorted_indices]
+
+    # Extract, stringify and sort both result sets.
+    gold_array = extract_values_to_numpy(results_gold)
+    agent_array = extract_values_to_numpy(results_agent)
+
+    # np.array_equal compares both shape and contents.
+    # bool() makes the return value a native Python bool rather than np.bool_.
+    return bool(np.array_equal(gold_array, agent_array))
 
 
 def build_comparison_row(

From 24aa3261f85bda4595dfb23fa052e94cbc1adf10 Mon Sep 17 00:00:00 2001
From: JonasMelo21 <jonashonorato4@gmail.com>
Date: Thu, 23 Apr 2026 14:44:56 -0300
Subject: [PATCH 06/10] benchmark com spider comparando resultados

---
 reports/meu_teste.csv | 4 ----
 requirements.txt      | 3 ++-
 2 files changed, 2 insertions(+), 5 deletions(-)
 delete mode 100644 reports/meu_teste.csv

diff --git a/reports/meu_teste.csv b/reports/meu_teste.csv
deleted file mode 100644
index 61890b3..0000000
--- a/reports/meu_teste.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-id_exemplo,tentativa_numero,db_id,pergunta_usuario,query_ouro_spider,query_agente_tentativa,tempo_agente_ms,veredito_critico,feedback_critico_recebido,erro_execucao,resultado_exato_match,similarity_score_sql
-1,1,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",12383.0,reprovado,"A consulta SQL assume que a coluna `Song_Name` existe diretamente na tabela `singer`, o que é uma estrutura de banco de dados incomum para representar múltiplas músicas de um cantor. Para responder adequadamente à pergunta, que implica que um cantor pode ter várias músicas, seria necessário fazer um `JOIN` entre as tabelas `singer` e `song` (assumindo uma tabela `song` separada). A query gerada pode falhar em encontrar todos os cantores relevantes se a coluna `Song_Name` na tabela `singer` não representar todas as músicas do cantor ou se não existir.",,True,0.9919
-1,2,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",29157.91,reprovado,"A consulta SQL assume que a coluna `Song_Name` existe diretamente na tabela `singer`. Em um esquema de banco de dados relacional típico, as músicas estariam em uma tabela separada (`song`) e ligadas aos cantores por uma chave estrangeira, exigindo uma operação `JOIN`. A consulta gerada não reflete essa estrutura comum e, portanto, não responde adequadamente à pergunta que implica uma relação entre cantores e músicas.",,True,0.9919
-1,3,concert_singer,What is the name and country of origin of every singer who has a song with the word 'Hey' in its title?,"SELECT name ,  country FROM singer WHERE song_name LIKE '%Hey%'","SELECT DISTINCT Name, Country FROM singer WHERE Song_Name LIKE '%Hey%'",65847.0,aprovado,"A consulta SQL responde corretamente à pergunta do usuário, selecionando o nome e o país dos cantores que possuem músicas com 'Hey' no título e utilizando `DISTINCT` para evitar duplicatas. Os resultados fazem sentido e a consulta foi executada com sucesso, indicando que a coluna `Song_Name` existe na tabela `singer` conforme a interpretação da IA.",,True,0.9242
diff --git a/requirements.txt b/requirements.txt
index 30663a7..c16caaf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ langchain-google-genai>=2.0.0
 langchain-openai
 pytest>=9.0.2
 pytest-recording>=0.13.0
-pytest-timeout>=2.3.0
\ No newline at end of file
+pytest-timeout>=2.3.0
+numpy
\ No newline at end of file

From c5c26d1953012fdbb0b8218dff624b8ddf558eb7 Mon Sep 17 00:00:00 2001
From: Petroncini <caiopetroncini@gmail.com>
Date: Fri, 1 May 2026 17:12:17 -0300
Subject: [PATCH 07/10] =?UTF-8?q?modifiquei=20script=20de=20avalia=C3=A7?=
 =?UTF-8?q?=C3=A3o=20para=20usar=20a=20classe=20InsightEngine=20e=20modifi?=
 =?UTF-8?q?quei=20o=20prompt=20do=20cr=C3=ADtico=20para=20ser=20menos=20cr?=
 =?UTF-8?q?=C3=ADtico?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/test_spider_eval.py     | 424 +++++++++++++++++++++++---------
 text_to_insight/nodes/critic.py |  26 ++
 2 files changed, 328 insertions(+), 122 deletions(-)

diff --git a/scripts/test_spider_eval.py b/scripts/test_spider_eval.py
index 6fa20fa..e8415f2 100644
--- a/scripts/test_spider_eval.py
+++ b/scripts/test_spider_eval.py
@@ -3,7 +3,7 @@
 Script de Avaliação do Agente contra Spider Dataset.
 
 Testa o agente Text-to-Insight contra perguntas reais do Spider dataset,
-rastreando cada tentativa (quando o crítico reprova e volta ao planejador).
+usando a classe InsightEngine do pacote text_to_insight.
 
 Uso:
     python scripts/test_spider_eval.py --sample-size 10 --seed 42
@@ -14,6 +14,7 @@
 import os
 import sys
 import time
+from datetime import datetime
 from pathlib import Path
 
 # Add project root to path
@@ -21,8 +22,9 @@
 
 from dotenv import load_dotenv
 
-# Importar módulos do projeto
-from src.graph import Graph
+# Importar InsightEngine do pacote text_to_insight
+from text_to_insight import InsightEngine
+
 from src.spider.csv_reporter import CSVReporter
 from src.spider.data_loader import (
     filter_by_db_id,
@@ -33,6 +35,7 @@
 from src.spider.metrics import (
     build_comparison_row,
     results_exact_match,
+    results_f1_score,
     sql_similarity_score,
 )
 from src.spider.query_executor import SpiderQueryExecutor
@@ -40,6 +43,147 @@
 load_dotenv()
 
 
+def _md_cell(value: object) -> str:
+    """Return *value* as text that is safe inside a Markdown table cell."""
+    # A raw "|" ends the cell early and a line break ends the whole table row.
+    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")
+
+
+def _gerar_relatorio_md(
+    report_path: str,
+    summary: dict,
+    f1_medio: float,
+    exact_match_rate: float,
+    all_rows: list[dict],
+    mismatches: list[dict],
+    model: str,
+    sample_size: int,
+    seed: int,
+    data_dir: str,
+) -> None:
+    """Write a Markdown report with run statistics and per-mismatch details.
+
+    Sections: run configuration, aggregate metrics read from ``summary``, one table
+    row per entry of ``all_rows`` and, per entry of ``mismatches``, both SQL queries
+    and up to 20 rows of each result set. Missing parent directories are created.
+    """
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    Path(report_path).parent.mkdir(parents=True, exist_ok=True)
+
+    lines = []
+    lines.append("# Spider Evaluation Report")
+    lines.append("")
+    lines.append(f"**Gerado em:** {timestamp}")
+    lines.append("")
+
+    # --- Configuration ---
+    lines.append("## Configuração")
+    lines.append("")
+    lines.append("| Parâmetro | Valor |")
+    lines.append("|-----------|-------|")
+    lines.append(f"| Modelo | `{model}` |")
+    lines.append(f"| Sample size | {sample_size} |")
+    lines.append(f"| Seed | {seed} |")
+    lines.append(f"| Data dir | `{data_dir}` |")
+    lines.append("")
+
+    # --- Summary ---
+    lines.append("## Resumo")
+    lines.append("")
+    lines.append("| Métrica | Valor |")
+    lines.append("|---------|-------|")
+    lines.append(f"| Total de perguntas | {summary['total_perguntas']} |")
+    lines.append(f"| Total de tentativas | {summary['total_tentativas']} |")
+    lines.append(f"| Perguntas aprovadas (crítico) | {summary['perguntas_aprovadas']} |")
+    lines.append(f"| Taxa de aprovação | {summary['taxa_aprovacao']:.1%} |")
+    lines.append(f"| Taxa de sucesso na 1ª tentativa | {summary['taxa_1a_tentativa']:.1%} |")
+    lines.append(f"| Tentativas médias por pergunta | {summary['tentativas_media']:.2f} |")
+    lines.append(f"| Similarity score médio (SQL) | {summary['similarity_media']:.4f} |")
+    lines.append(f"| F1 score médio (resultados) | {f1_medio:.4f} |")
+    lines.append(f"| Exact match rate | {exact_match_rate:.1%} |")
+    lines.append(f"| Mismatches | {len(mismatches)}/{len(all_rows)} |")
+    lines.append(f"| Tempo médio por tentativa | {summary['tempo_medio_ms']:.0f} ms |")
+    lines.append("")
+
+    # --- Per-question table ---
+    lines.append("## Resultados por Pergunta")
+    lines.append("")
+    lines.append("| # | DB | Pergunta | Match | F1 | Similarity | Veredito |")
+    lines.append("|---|-----|----------|-------|----|------------|----------|")
+    for r in all_rows:
+        pergunta = str(r['pergunta_usuario'])
+        pergunta_curta = _md_cell(pergunta[:50]) + ("..." if len(pergunta) > 50 else "")
+        match_icon = "✅" if r['resultado_exato_match'] is True else ("❌" if r['resultado_exato_match'] is False else "⚠️")
+        lines.append(
+            f"| {r['id_exemplo']} "
+            f"| {r['db_id']} "
+            f"| {pergunta_curta} "
+            f"| {match_icon} "
+            f"| {r.get('resultado_f1', 0):.2f} "
+            f"| {r['similarity_score_sql']:.2f} "
+            f"| {r['veredito_critico']} |"
+        )
+    lines.append("")
+
+    # --- Mismatch details ---
+    if mismatches:
+        lines.append("## Detalhes dos Mismatches")
+        lines.append("")
+        lines.append(f"Total: **{len(mismatches)}** perguntas não obtiveram exact match.")
+        lines.append("")
+
+        for i, m in enumerate(mismatches, 1):
+            lines.append(f"### Mismatch {i} — Pergunta #{m['id']} (`{m['db_id']}`)")
+            lines.append("")
+            lines.append(f"**Pergunta:** {m['pergunta']}")
+            lines.append("")
+            lines.append(f"**F1:** {m['f1']:.4f} | **Precision:** {m['precision']:.4f} | **Recall:** {m['recall']:.4f}")
+            lines.append("")
+
+            # SQL comparison
+            lines.append("**Query Ouro (Spider):**")
+            lines.append("```sql")
+            lines.append(m['query_ouro'])
+            lines.append("```")
+            lines.append("")
+            lines.append("**Query Agente:**")
+            lines.append("```sql")
+            lines.append(m['query_agente'])
+            lines.append("```")
+            lines.append("")
+
+            # Result comparison: the same table layout for both result sets (up to 20 rows each)
+            for titulo, resultado in (
+                ("**Resultado Ouro** (primeiras 20 linhas):", m['resultado_ouro']),
+                ("**Resultado Agente** (primeiras 20 linhas):", m['resultado_agente']),
+            ):
+                lines.append(titulo)
+                lines.append("")
+                amostra = resultado[:20]
+                if amostra:
+                    cols = list(amostra[0].keys())
+                    lines.append("| " + " | ".join(_md_cell(c) for c in cols) + " |")
+                    lines.append("| " + " | ".join(["---"] * len(cols)) + " |")
+                    for row in amostra:
+                        vals = [_md_cell(row.get(c, "")) for c in cols]
+                        lines.append("| " + " | ".join(vals) + " |")
+                    if len(resultado) > 20:
+                        lines.append(f"*... e mais {len(resultado) - 20} linhas*")
+                else:
+                    lines.append("*(vazio)*")
+                lines.append("")
+            lines.append("---")
+            lines.append("")
+    else:
+        lines.append("## Detalhes dos Mismatches")
+        lines.append("")
+        lines.append("🎉 **Nenhum mismatch!** Todos os resultados foram exact match.")
+        lines.append("")
+
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+
 def main():
     """Main entry point."""
     parser = argparse.ArgumentParser(
@@ -90,8 +234,8 @@ def main():
     args = parser.parse_args()
 
     # Validar API key
-    # model = "gpt-4o-mini"
-    model = "gemini-2.5-flash"
+    model = "gpt-4o-mini"
+    # model = "gemini-2.5-flash"
     
     api_key = os.getenv("OPENAI_API_KEY") if "gpt" in model.lower() else os.getenv("GOOGLE_API_KEY")
     if not api_key:
@@ -129,14 +273,8 @@ def main():
 
     # 4. Inicializar componentes
     print("\n🔧 Inicializando componentes...")
-    try:
-        grafo = Graph(model=model, api_key=api_key)
-        print("✓ Grafo LangGraph inicializado")
-    except Exception as e:
-        print(f"❌ Erro ao inicializar grafo: {e}")
-        sys.exit(1)
 
-    executor = SpiderQueryExecutor()
+    executor = SpiderQueryExecutor(database_dir=str(Path(args.data_dir) / "database"))
     print("✓ Query executor inicializado")
 
     # 5. Preparar CSV
@@ -153,8 +291,12 @@ def main():
     print("=" * 100)
 
     all_rows = []
+    mismatches = []  # Coletar detalhes dos casos que não bateram
     ex_id = 1
 
+    # Cache de InsightEngine por db_id para evitar recompilação do grafo
+    engine_cache: dict[str, InsightEngine] = {}
+
     for idx, ex in enumerate(exemplos, 1):
         pergunta = ex.get("question", "")
         query_ouro = ex.get("query", "")
@@ -173,123 +315,134 @@ def main():
 
         print(f"     ✓ Query ouro retornou {resultado_ouro['row_count']} linhas")
 
-        # Invocar grafo com stream() para rastrear tentativas
-        tentativa_numero = 1
-        estado_inicial = {
-            "pergunta_usuario": pergunta,
-            "db_path": str(executor.get_db_path(db_id)),
-            "contexto_schema": "",
-            "sql_gerada": "",
-            "linhas_resultado_preview": [],
-            "total_linhas_resultado": 0,
-            "erro_execucao": "",
-            "saida_terminal": "",
-            "feedback_critico": "",
-            "status": "iniciado",
-            "tentativas_loop": 0,
-        }
-
-        print(f"     → Invocando agente (max tentativas: {args.max_attempts})...")
+        # Obter ou criar InsightEngine para este db_id
+        db_path = str(executor.get_db_path(db_id))
+        if db_id not in engine_cache:
+            try:
+                engine_cache[db_id] = InsightEngine(
+                    api_key=api_key,
+                    model=model,
+                    db_path=db_path,
+                    hitl=False,
+                    show_output=False,
+                )
+                print(f"     ✓ InsightEngine inicializado para db={db_id}")
+            except Exception as e:
+                print(f"     ❌ Erro ao inicializar InsightEngine: {e}")
+                continue
+
+        engine = engine_cache[db_id]
+
+        # Invocar agente via InsightEngine.run()
+        print(f"     → Invocando agente via InsightEngine...")
         inicio_agente = time.time()
-        full_estado = estado_inicial.copy()  # Manter estado acumulado
 
         try:
-            for output in grafo.stream(
-                estado_inicial, config={"recursion_limit": 30,
-                                        "configurable": {
-                                            "thread_id": f"spider_test_{ex_id}", # config incluindo a thread de memória 
-                                        }
-                                    }
-            ):
-                # stream() retorna dict: {'nó_name': {mudanças_do_nó}}
-                # Acumular mudanças no estado completo
-                for node_name, mudancas in output.items():
-                    full_estado.update(mudancas)
-                    
-                    # Após crítico retornar, coletar métricas e salvar
-                    if "critic" in node_name.lower():
-                        tempo_tentativa = (time.time() - inicio_agente) * 1000
-
-                        query_agente = full_estado.get("sql_gerada", "")
-                        veredito = full_estado.get("status", "")
-                        feedback_estado = full_estado.get("feedback_critico", "")
-                        erro_exec = full_estado.get("erro_execucao", "")
-                        tentativas = full_estado.get("tentativas_loop", 1)
-
-                        # Mapear status para veredito e definir feedback
-                        if veredito == "aprovado":
-                            veredito_critico = "aprovado"
-                            # Se aprovado, feedback é confirmação
-                            feedback_critico = feedback_estado if feedback_estado else "Aprovado"
-                        elif veredito == "reprovado":
-                            veredito_critico = "reprovado"
-                            # Se reprovado, usar feedback do crítico
-                            feedback_critico = feedback_estado if feedback_estado else "Reprovado pelo crítico"
-                        else:
-                            veredito_critico = "erro"
-                            feedback_critico = feedback_estado if feedback_estado else "Erro na avaliação"
-
-                        # Comparar resultados se query agente foi gerada
-                        resultado_exato_match = None
-                        similarity_score = 0.0
-
-                        if query_agente and not erro_exec:
-                            resultado_agente = executor.execute_query(db_id, query_agente)
-                            if resultado_agente["success"]:
-                                # Imprimir os resultados das duas queries
-                                print(f"Resultado Ouro: {resultado_ouro["results"][:50]}")
-                                print(f"Resultado Text-to-Insight: {resultado_agente["results"][:50]}")
-
-                                resultado_exato_match = results_exact_match(
-                                    resultado_ouro["results"],
-                                    resultado_agente["results"],
-                                )
-                                similarity_score = sql_similarity_score(query_ouro, query_agente)
-                                print(
-                                    f"       Tentativa {tentativas}: "
-                                    f"similarity={similarity_score:.2f}, "
-                                    f"match={resultado_exato_match}, "
-                                    f"veredito={veredito_critico}"
-                                )
-                            else:
-                                erro_exec = resultado_agente["error"]
-                        else:
-                            print(
-                                f"       Tentativa {tentativas}: "
-                                f"sem query gerada ou com erro de execução"
-                            )
-
-                        # Construir linha para CSV
-                        row = build_comparison_row(
-                            id_exemplo=ex_id,
-                            tentativa_numero=tentativas,
-                            db_id=db_id,
-                            pergunta=pergunta,
-                            query_ouro=query_ouro,
-                            query_agente=query_agente,
-                            tempo_agente_ms=tempo_tentativa,
-                            veredito_critico=veredito_critico,
-                            feedback_critico=feedback_critico,
-                            erro_execucao=erro_exec,
-                            resultado_exato_match=resultado_exato_match,
-                            similarity_score=similarity_score,
-                        )
-
-                        reporter.append_row(row)
-                        all_rows.append(row)
-
-                        # Se aprovado, terminar loop
-                        if veredito_critico == "aprovado":
-                            print(f"     ✅ APROVADO na tentativa {tentativas}")
-                            break
-                        elif tentativas >= args.max_attempts:
-                            print(f"     ❌ MÁXIMO DE TENTATIVAS ({args.max_attempts}) ATINGIDO")
-                            break
-
+            resultado = engine.run(
+                thread_id=f"spider_test_{ex_id}",
+                query=pergunta,
+            )
         except Exception as e:
             print(f"     ⚠️  Erro ao processar pergunta: {str(e)}")
             continue
 
+        tempo_total = (time.time() - inicio_agente) * 1000
+
+        # Extrair dados do resultado
+        query_agente = resultado.get("sql_gerada", "")
+        veredito = resultado.get("status", "")
+        feedback_estado = resultado.get("feedback_critico", "")
+        erro_exec = resultado.get("erro_execucao", "")
+        tentativas = resultado.get("tentativas_loop", 1)
+
+        # Mapear status para veredito e definir feedback
+        if veredito == "aprovado":
+            veredito_critico = "aprovado"
+            feedback_critico = feedback_estado if feedback_estado else "Aprovado"
+        elif veredito == "reprovado":
+            veredito_critico = "reprovado"
+            feedback_critico = feedback_estado if feedback_estado else "Reprovado pelo crítico"
+        else:
+            veredito_critico = "erro"
+            feedback_critico = feedback_estado if feedback_estado else "Erro na avaliação"
+
+        # Comparar resultados se query agente foi gerada
+        resultado_exato_match = None
+        similarity_score = 0.0
+        f1_scores = {"f1": 0.0, "precision": 0.0, "recall": 0.0}
+
+        if query_agente and not erro_exec:
+            resultado_agente = executor.execute_query(db_id, query_agente)
+            if resultado_agente["success"]:
+                # Imprimir os resultados das duas queries
+                print(f"Resultado Ouro: {resultado_ouro['results'][:50]}")
+                print(f"Resultado Text-to-Insight: {resultado_agente['results'][:50]}")
+
+                resultado_exato_match = results_exact_match(
+                    resultado_ouro["results"],
+                    resultado_agente["results"],
+                )
+                similarity_score = sql_similarity_score(query_ouro, query_agente)
+                f1_scores = results_f1_score(
+                    resultado_ouro["results"],
+                    resultado_agente["results"],
+                )
+                print(
+                    f"       Resultado final ({tentativas} tentativa(s)): "
+                    f"similarity={similarity_score:.2f}, "
+                    f"match={resultado_exato_match}, "
+                    f"F1={f1_scores['f1']:.2f}, "
+                    f"veredito={veredito_critico}"
+                )
+                # Coletar detalhes dos mismatches
+                if not resultado_exato_match:
+                    mismatches.append({
+                        "id": ex_id,
+                        "db_id": db_id,
+                        "pergunta": pergunta,
+                        "query_ouro": query_ouro,
+                        "query_agente": query_agente,
+                        "resultado_ouro": resultado_ouro["results"],
+                        "resultado_agente": resultado_agente["results"],
+                        "f1": f1_scores["f1"],
+                        "precision": f1_scores["precision"],
+                        "recall": f1_scores["recall"],
+                    })
+            else:
+                erro_exec = resultado_agente["error"]
+        else:
+            print(
+                f"       Resultado final ({tentativas} tentativa(s)): "
+                f"sem query gerada ou com erro de execução"
+            )
+
+        # Construir linha para CSV
+        row = build_comparison_row(
+            id_exemplo=ex_id,
+            tentativa_numero=tentativas,
+            db_id=db_id,
+            pergunta=pergunta,
+            query_ouro=query_ouro,
+            query_agente=query_agente,
+            tempo_agente_ms=tempo_total,
+            veredito_critico=veredito_critico,
+            feedback_critico=feedback_critico,
+            erro_execucao=erro_exec,
+            resultado_exato_match=resultado_exato_match,
+            similarity_score=similarity_score,
+            resultado_f1=f1_scores["f1"],
+            resultado_precision=f1_scores["precision"],
+            resultado_recall=f1_scores["recall"],
+        )
+
+        reporter.append_row(row)
+        all_rows.append(row)
+
+        if veredito_critico == "aprovado":
+            print(f"     ✅ APROVADO após {tentativas} tentativa(s)")
+        else:
+            print(f"     ❌ NÃO APROVADO após {tentativas} tentativa(s)")
+
         ex_id += 1
         time.sleep(1)  # Delay entre perguntas
 
@@ -300,6 +453,14 @@ def main():
 
     if all_rows:
         summary = reporter.generate_summary(all_rows)
+        # Calcular F1 médio
+        f1_values = [float(r.get("resultado_f1", 0)) for r in all_rows if r.get("resultado_f1")]
+        f1_medio = sum(f1_values) / len(f1_values) if f1_values else 0.0
+        # Calcular exact match rate
+        match_values = [r.get("resultado_exato_match") for r in all_rows]
+        exact_matches = sum(1 for v in match_values if v is True)
+        exact_match_rate = exact_matches / len(all_rows) if all_rows else 0.0
+
         print(f"Total de perguntas: {summary['total_perguntas']}")
         print(f"Total de tentativas: {summary['total_tentativas']}")
         print(f"Perguntas aprovadas: {summary['perguntas_aprovadas']}")
@@ -307,8 +468,27 @@ def main():
         print(f"Taxa de sucesso na 1ª tentativa: {summary['taxa_1a_tentativa']:.1%}")
         print(f"Tentativas médias por pergunta: {summary['tentativas_media']:.2f}")
         print(f"Similarity score médio: {summary['similarity_media']:.4f}")
+        print(f"F1 score médio (resultados): {f1_medio:.4f}")
+        print(f"Exact match rate: {exact_match_rate:.1%}")
+        print(f"Mismatches: {len(mismatches)}/{len(all_rows)}")
         print(f"Tempo médio por tentativa: {summary['tempo_medio_ms']:.2f} ms")
         print(f"\n✅ CSV salvo em: {csv_path}")
+
+        # 8. Gerar relatório textual em Markdown
+        report_path = csv_path.replace(".csv", "_report.md")
+        _gerar_relatorio_md(
+            report_path=report_path,
+            summary=summary,
+            f1_medio=f1_medio,
+            exact_match_rate=exact_match_rate,
+            all_rows=all_rows,
+            mismatches=mismatches,
+            model=model,
+            sample_size=args.sample_size,
+            seed=args.seed,
+            data_dir=args.data_dir,
+        )
+        print(f"✅ Relatório salvo em: {report_path}")
     else:
         print("❌ Nenhum resultado para salvar")
 
diff --git a/text_to_insight/nodes/critic.py b/text_to_insight/nodes/critic.py
index 04e53c7..af1bfbf 100644
--- a/text_to_insight/nodes/critic.py
+++ b/text_to_insight/nodes/critic.py
@@ -38,6 +38,32 @@
 2. Os resultados fazem sentido?
 3. Há algum erro lógico ou de interpretação?
 
+Ao avaliar, priorize utilidade prática e correção semântica da resposta,
+não perfeição formal.
+
+Diferenças de formato, representação ou precisão que não alterem
+substancialmente a resposta NÃO devem causar reprovação.
+
+Exemplos de casos que normalmente devem ser APROVADOS:
+- Ano médio retornado como float em vez de inteiro/data
+- Pequenas diferenças de arredondamento
+- Colunas extras irrelevantes
+- Nomes/aliases diferentes
+- Resultado parcialmente correto mas ainda útil
+- Agregações corretas com precisão numérica diferente da esperada
+
+REPROVE apenas quando houver falha material, por exemplo:
+- A query responde outra pergunta
+- O dado necessário para responder não está presente
+- Filtros importantes estão errados ou ausentes
+- JOIN incorreto altera significativamente os resultados
+- Métrica errada (SUM vs AVG, COUNT vs COUNT DISTINCT, etc.)
+- Resultado vazio inesperado
+- Erro SQL ou inconsistência lógica grave
+
+Considere o custo de retentativas. Em caso de dúvida entre APROVADO
+e REPROVADO, prefira APROVADO se a resposta ainda for útil para o usuário. Leve em consideração que ainda tem um agente depois de você que irá interpretar o resultado da query e criar uma resposta em linguagem natural.
+
 Responda no formato:
 VEREDITO: APROVADO ou REPROVADO
 FEEDBACK: <sua avaliação em 1-3 frases>"""

From 2a5be5c49f6bc72b3268ee8c6a5862a5ba021d92 Mon Sep 17 00:00:00 2001
From: JonasMelo21 <jonashonorato4@gmail.com>
Date: Sun, 3 May 2026 19:23:06 -0300
Subject: [PATCH 08/10] =?UTF-8?q?feat:=20mostrar=20resultado=20da=20query?=
 =?UTF-8?q?=20na=20sa=C3=ADda=20do=20terminal?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ARQUITETURA.md                               |  6 ++
 DESENVOLVIMENTO.md                           |  6 ++
 README.md                                    | 21 ++++++
 requirements.txt                             |  4 +-
 text_to_insight/nodes/code_agent/code_sql.py |  7 +-
 text_to_insight/nodes/sandbox.py             |  2 +
 text_to_insight/runtime.py                   | 74 +++++++++++++++++---
 7 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/ARQUITETURA.md b/ARQUITETURA.md
index 3226d33..1e9a6cc 100644
--- a/ARQUITETURA.md
+++ b/ARQUITETURA.md
@@ -53,6 +53,12 @@ Responsavel por:
 - persistir metricas em CSV;
 - exibir resultado final em formato padrao.
 
+Funcoes principais:
+
+- `_montar_saida_resultado_terminal(resultado)`: template reutilizavel que transforma linhas brutas da query em texto formatado para o terminal, usando `tabulate` para renderizar a tabela. Suporta fallback para amostra quando resultado completo nao estiver disponivel.
+- `salvar_resultado_csv(resultado, pasta)`: exporta o resultado completo em CSV com timestamp em `results/`.
+- `exibir_resultado_console(resultado)`: orquestra a exibicao completa do resultado: SQL, saida da execucao, tabela formatada, feedback e resposta natural.
+
 ### 3) API publica da biblioteca
 
 Arquivos:
diff --git a/DESENVOLVIMENTO.md b/DESENVOLVIMENTO.md
index 8811495..a77d097 100644
--- a/DESENVOLVIMENTO.md
+++ b/DESENVOLVIMENTO.md
@@ -60,8 +60,14 @@ python main.py --hitl on "Quantos pedidos existem no banco?"
 
 # biblioteca instalada (entrypoint)
 text-to-insight --hitl off "Quais categorias vendem mais?"
+
+# com modelo OpenAI
+set -a && source .env && set +a
+python main.py --hitl off --model gpt-4o-mini --api-key-env OPENAI_API_KEY "Quantos pedidos existem?"
 ```
 
+O resultado e exibido no terminal em formato tabular sob o bloco `RESULTADO:`, junto com SQL gerada, feedback do critico e resposta natural.
+
 ## Estrutura relevante
 
 ```text
diff --git a/README.md b/README.md
index 24bd3dd..d2ca441 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,27 @@ python main.py --hitl off "Quais categorias vendem mais?"
 text-to-insight --hitl on "Quantos pedidos existem no banco?"
 ```
 
+### Saida do terminal
+
+Apos a execucao, o resultado da query e exibido no bloco `RESULTADO` em formato tabular:
+
+```
+----------------------------------------------------------------------
+RESULTADO:
+----------------------------------------------------------------------
++------------+
+|   COUNT(*) |
++============+
+|      99441 |
++------------+
+Total de linhas retornadas: 1
+```
+
+O template de apresentacao usa `tabulate` para montar as linhas da query:
+- Ate 5 linhas: exibe a tabela completa
+- Acima de 5 linhas: mostra as 3 primeiras, omite as intermediarias, exibe as 2 ultimas
+- Resultado completo e exportado em CSV em `results/` automaticamente
+
 ## Testes
 
 Camadas atuais:
diff --git a/requirements.txt b/requirements.txt
index 76b8fa5..8664add 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ langchain-openai>=0.1.0
 pytest>=9.0.2
 pytest-recording>=0.13.0
 pytest-timeout>=2.3.0
-numpy
\ No newline at end of file
+numpy
+pandas>=2.0.0
+tabulate>=0.9.0
\ No newline at end of file
diff --git a/text_to_insight/nodes/code_agent/code_sql.py b/text_to_insight/nodes/code_agent/code_sql.py
index 3e52869..e737e23 100644
--- a/text_to_insight/nodes/code_agent/code_sql.py
+++ b/text_to_insight/nodes/code_agent/code_sql.py
@@ -55,7 +55,7 @@ def validar_sql_segura(sql: str) -> tuple[bool, str]:
 def executar_sql_sqlite(
     db_path: str,
     sql: str,
-    limite_preview: int = 30,
+    limite_preview: int = 5,
 ) -> dict[str, Any]:
     """
     Executa SQL validada em SQLite modo read-only e retorna resultado estruturado.
@@ -66,6 +66,7 @@ def executar_sql_sqlite(
             "ok": False,
             "erro_execucao": erro_validacao,
             "linhas_resultado_preview": [],
+            "linhas_resultado_completo": [],
             "total_linhas_resultado": 0,
             "saida_terminal": f"[SANDBOX] SQL invalida: {erro_validacao}",
         }
@@ -77,6 +78,7 @@ def executar_sql_sqlite(
             "ok": False,
             "erro_execucao": msg,
             "linhas_resultado_preview": [],
+            "linhas_resultado_completo": [],
             "total_linhas_resultado": 0,
             "saida_terminal": f"[SANDBOX] {msg}",
         }
@@ -90,11 +92,13 @@ def executar_sql_sqlite(
             rows = cur.fetchall()
             total = len(rows)
             preview_rows = [dict(r) for r in rows[:limite_preview]]
+            all_rows = [dict(r) for r in rows]
 
             return {
                 "ok": True,
                 "erro_execucao": "",
                 "linhas_resultado_preview": preview_rows,
+                "linhas_resultado_completo": all_rows,
                 "total_linhas_resultado": total,
                 "saida_terminal": (
                     f"[SANDBOX] Execucao OK | linhas_total={total} "
@@ -108,6 +112,7 @@ def executar_sql_sqlite(
             "ok": False,
             "erro_execucao": f"Falha ao executar SQL: {e}",
             "linhas_resultado_preview": [],
+            "linhas_resultado_completo": [],
             "total_linhas_resultado": 0,
             "saida_terminal": f"[SANDBOX] Erro de execucao: {e}",
         }
\ No newline at end of file
diff --git a/text_to_insight/nodes/sandbox.py b/text_to_insight/nodes/sandbox.py
index 414b9ec..ed29409 100644
--- a/text_to_insight/nodes/sandbox.py
+++ b/text_to_insight/nodes/sandbox.py
@@ -35,6 +35,7 @@ def nos_nodo_sandbox(estado: EstadoTextToInsight) -> dict:
         print(f"[EXECUTOR] SQL executada com sucesso — {resultado['total_linhas_resultado']} linhas.")
         return {
             "linhas_resultado_preview": resultado["linhas_resultado_preview"],
+            "linhas_resultado_completo": resultado["linhas_resultado_completo"],
             "total_linhas_resultado": resultado["total_linhas_resultado"],
             "saida_terminal": resultado["saida_terminal"],
             "erro_execucao": "",
@@ -44,6 +45,7 @@ def nos_nodo_sandbox(estado: EstadoTextToInsight) -> dict:
         print(f"[EXECUTOR] Erro na execução: {resultado['erro_execucao']}")
         return {
             "linhas_resultado_preview": [],
+            "linhas_resultado_completo": [],
             "total_linhas_resultado": 0,
             "saida_terminal": resultado["saida_terminal"],
             "erro_execucao": resultado["erro_execucao"],
diff --git a/text_to_insight/runtime.py b/text_to_insight/runtime.py
index c1a7114..d41bb78 100644
--- a/text_to_insight/runtime.py
+++ b/text_to_insight/runtime.py
@@ -1,8 +1,13 @@
 from __future__ import annotations
 
+import csv
 import time
+from datetime import datetime
+from pathlib import Path
 from typing import Any, Callable
 
+from tabulate import tabulate
+
 from .utils import salvar_metricas_csv
 
 HITL_AWAITING_STATUS = "AWAITING_USER"
@@ -23,9 +28,59 @@ def construir_estado_inicial(pergunta: str, db_path: str) -> dict[str, Any]:
         "tentativas_loop": 0,
         "db_path": db_path,
         "espera_humana": False,
+        "linhas_resultado_completo": [],
     }
 
 
+def _montar_saida_resultado_terminal(resultado: dict[str, Any]) -> str:
+    """Monta o texto do bloco de resultado para exibição no terminal."""
+    linhas = resultado.get("linhas_resultado_completo", []) or []
+    if not linhas:
+        linhas = resultado.get("linhas_resultado_preview", []) or []
+    total = int(resultado.get("total_linhas_resultado", 0) or 0)
+
+    if not linhas:
+        return "[Nenhum resultado]"
+
+    colunas = list(linhas[0].keys()) if isinstance(linhas[0], dict) else []
+    if not colunas:
+        return "[Resultado indisponivel para exibicao]"
+
+    def _formatar_tabela(amostras: list[dict[str, Any]]) -> str:
+        return tabulate(amostras, headers="keys", tablefmt="grid", showindex=False)
+
+    partes: list[str] = []
+
+    if len(linhas) <= 5:
+        partes.append(_formatar_tabela(linhas))
+    else:
+        partes.append(_formatar_tabela(linhas[:3]))
+        partes.append(f"... (omitted {len(linhas) - 5} rows) ...")
+        partes.append(_formatar_tabela(linhas[-2:]))
+
+    partes.append(f"Total de linhas retornadas: {total}")
+    return "\n".join(partes)
+
+
+def salvar_resultado_csv(resultado: dict[str, Any], pasta_resultados: Path | None = None) -> Path | None:
+    """Salva o resultado completo em CSV quando houver linhas para exportar."""
+    linhas = resultado.get("linhas_resultado_completo", []) or []
+    if not linhas:
+        return None
+
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    resultados_dir = pasta_resultados or (Path(__file__).parent.parent / "results")
+    resultados_dir.mkdir(exist_ok=True)
+    csv_path = resultados_dir / f"query_{timestamp}.csv"
+
+    with csv_path.open("w", newline="", encoding="utf-8") as arquivo_csv:
+        writer = csv.DictWriter(arquivo_csv, fieldnames=list(linhas[0].keys()))
+        writer.writeheader()
+        writer.writerows(linhas)
+
+    return csv_path
+
+
 def exibir_resultado_console(resultado: dict[str, Any]) -> None:
     """Exibe o resultado final de forma consistente entre CLI e engine."""
     print("\n" + "=" * 70)
@@ -47,18 +102,17 @@ def exibir_resultado_console(resultado: dict[str, Any]) -> None:
     saida = str(resultado.get("saida_terminal", "")).strip()
     print(saida if saida else "[Nenhuma saida]")
 
+    # Display the query result as a text table built by _montar_saida_resultado_terminal (tabulate)
     print("\n" + "-" * 70)
-    print("RESULTADO (preview):")
+    print("RESULTADO:")
     print("-" * 70)
-    preview = resultado.get("linhas_resultado_preview", []) or []
-    total = int(resultado.get("total_linhas_resultado", 0) or 0)
-    if preview:
-        for row in preview[:10]:
-            print(row)
-        if total > 10:
-            print(f"... ({total - 10} linhas omitidas)")
-    else:
-        print("[Nenhum resultado]")
+
+    print(_montar_saida_resultado_terminal(resultado))
+
+    csv_path = salvar_resultado_csv(resultado)
+    if csv_path is not None:
+        total = int(resultado.get("total_linhas_resultado", 0) or 0)
+        print(f"\n✓ Resultados completos salvos em: {csv_path.as_posix()} ({total} linhas)")
 
     print("\n" + "-" * 70)
     print("FEEDBACK DO CRITICO:")

From 9abf0d3740cbf2060e2a6008e9efed80a3ec0eb9 Mon Sep 17 00:00:00 2001
From: Petroncini <caiopetroncini@gmail.com>
Date: Wed, 6 May 2026 11:03:34 -0300
Subject: [PATCH 09/10] feat: adicionei historico de tentativas e feedbacks ao
 estado do grafo e prompt do agente de codigo

---
 src/spider/csv_reporter.py                    |  3 +
 src/spider/metrics.py                         | 71 ++++++++++++++++++-
 text_to_insight/graph.py                      |  9 ++-
 .../nodes/code_agent/code_agent.py            | 37 ++++++----
 text_to_insight/nodes/critic.py               | 30 ++++++++
 text_to_insight/runtime.py                    |  1 +
 text_to_insight/state.py                      |  1 +
 7 files changed, 138 insertions(+), 14 deletions(-)

diff --git a/src/spider/csv_reporter.py b/src/spider/csv_reporter.py
index 8ab16b0..249a687 100644
--- a/src/spider/csv_reporter.py
+++ b/src/spider/csv_reporter.py
@@ -29,6 +29,9 @@ class CSVReporter:
         "erro_execucao",
         "resultado_exato_match",
         "similarity_score_sql",
+        "resultado_f1",
+        "resultado_precision",
+        "resultado_recall",
     ]
 
     def __init__(self, filepath: str | Path):
diff --git a/src/spider/metrics.py b/src/spider/metrics.py
index 47c7b10..5fd0ad9 100644
--- a/src/spider/metrics.py
+++ b/src/spider/metrics.py
@@ -4,12 +4,15 @@
 Fornece:
 - Similarity score entre duas queries (difflib-based)
 - Comparação de resultados (exato match)
+- F1 score de resultados (row-level precision/recall)
 - Normalização de SQL para comparação
 """
 
 import difflib
 import re
+from collections import Counter
 from typing import Any
+
 import numpy as np
 
 
@@ -153,6 +156,62 @@ def extract_values_to_numpy(results: list[dict[str, Any]]) -> np.ndarray:
     return bool(np.array_equal(gold_array, agent_array))
 
 
+def results_f1_score(
+    results_gold: list[dict[str, Any]],
+    results_agent: list[dict[str, Any]],
+) -> dict[str, float]:
+    """
+    Calcula Precision, Recall e F1 row-level entre resultados gold e agent.
+
+    Cada linha é convertida em uma tupla canônica (valores ordenados, como string)
+    e tratada como membro de um multiset (Counter). Isso permite medir parcialmente
+    quantas linhas o agente acertou, mesmo que não tenha acertado todas.
+
+    - Precision: das linhas que o agente retornou, quantas estão no gold?
+    - Recall:    das linhas do gold, quantas o agente retornou?
+    - F1:        média harmônica de precision e recall.
+
+    Args:
+        results_gold: Resultados da query ouro
+        results_agent: Resultados da query do agente
+
+    Returns:
+        Dict com chaves: precision, recall, f1 (floats de 0 a 1)
+    """
+    def _row_to_canonical(row: dict[str, Any]) -> tuple:
+        """Converte uma linha em tupla canônica de valores (ordenados, stringificados)."""
+        values = [str(v) if v is not None else "NULL" for v in row.values()]
+        return tuple(sorted(values))
+
+    # Ambos vazios → match perfeito
+    if not results_gold and not results_agent:
+        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
+
+    # Um vazio e outro não
+    if not results_gold:
+        return {"precision": 0.0, "recall": 1.0, "f1": 0.0}
+    if not results_agent:
+        return {"precision": 1.0, "recall": 0.0, "f1": 0.0}
+
+    gold_bag = Counter(_row_to_canonical(r) for r in results_gold)
+    agent_bag = Counter(_row_to_canonical(r) for r in results_agent)
+
+    # Interseção: min(count_gold, count_agent) para cada tupla
+    true_positives = sum((gold_bag & agent_bag).values())
+    total_agent = sum(agent_bag.values())
+    total_gold = sum(gold_bag.values())
+
+    precision = true_positives / total_agent if total_agent > 0 else 0.0
+    recall = true_positives / total_gold if total_gold > 0 else 0.0
+
+    if precision + recall == 0:
+        f1 = 0.0
+    else:
+        f1 = 2 * (precision * recall) / (precision + recall)
+
+    return {"precision": round(precision, 4), "recall": round(recall, 4), "f1": round(f1, 4)}
+
+
 def build_comparison_row(
     id_exemplo: int,
     tentativa_numero: int,
@@ -166,6 +225,9 @@ def build_comparison_row(
     erro_execucao: str,
     resultado_exato_match: bool | None,
     similarity_score: float,
+    resultado_f1: float = 0.0,
+    resultado_precision: float = 0.0,
+    resultado_recall: float = 0.0,
 ) -> dict[str, Any]:
     """
     Constrói uma linha para o CSV de avaliação.
@@ -183,9 +245,12 @@ def build_comparison_row(
         erro_execucao: Mensagem de erro (vazio se OK)
         resultado_exato_match: True/False se resultado foi exato (None se erro)
         similarity_score: Score 0-1
+        resultado_f1: F1 score row-level (0-1)
+        resultado_precision: Precision row-level (0-1)
+        resultado_recall: Recall row-level (0-1)
 
     Returns:
-        Dict com 12 chaves para CSV
+        Dict com 15 chaves para CSV
     """
     return {
         "id_exemplo": id_exemplo,
@@ -200,4 +265,8 @@ def build_comparison_row(
         "erro_execucao": erro_execucao,
         "resultado_exato_match": resultado_exato_match if resultado_exato_match is not None else "",
         "similarity_score_sql": round(similarity_score, 4),
+        "resultado_f1": resultado_f1,
+        "resultado_precision": resultado_precision,
+        "resultado_recall": resultado_recall,
     }
+
diff --git a/text_to_insight/graph.py b/text_to_insight/graph.py
index d4edfd8..74d0ccf 100644
--- a/text_to_insight/graph.py
+++ b/text_to_insight/graph.py
@@ -81,11 +81,18 @@ def _construir_grafo_text_to_insight(self, hitl: bool) -> StateGraph:
             }
         )
 
+        MAX_TENTATIVAS_CRITICO = 3
+
         def roteador_critico(estado: EstadoTextToInsight) -> str:
             status = estado.get("status", "")
-            # Se aprovado, enviar para nó de resposta; senão retornar ao planejador
+            tentativas = estado.get("tentativas_loop", 0)
+            # Se aprovado, enviar para nó de resposta
             if status == "aprovado":
                 return "resposta"
+            # Se atingiu limite de tentativas, encerrar mesmo reprovado
+            if tentativas >= MAX_TENTATIVAS_CRITICO:
+                print(f"[ROTEADOR_CRITICO] Limite de {MAX_TENTATIVAS_CRITICO} tentativas atingido → resposta (forçado)")
+                return "resposta"
             return "planejador"
 
         construtor_grafo.add_conditional_edges(
diff --git a/text_to_insight/nodes/code_agent/code_agent.py b/text_to_insight/nodes/code_agent/code_agent.py
index e0557ba..a2d5d0c 100644
--- a/text_to_insight/nodes/code_agent/code_agent.py
+++ b/text_to_insight/nodes/code_agent/code_agent.py
@@ -32,8 +32,8 @@
 === CONVERSA PRÉVIA (CONTEXTO ADICIONAL) ===
 {conversa_previa}
 
-=== FEEDBACK CRÍTICO (SE HOUVER) ===
-{feedback_section}
+=== HISTÓRICO DE TENTATIVAS ANTERIORES ===
+{historico_tentativas_section}
 
 Responda APENAS com a consulta SQL, sem markdown, sem explicação."""
 
@@ -48,6 +48,24 @@ def _extrair_sql(resposta: str) -> str:
     return resposta.strip()
 
 
+def _formatar_historico_tentativas(historico: list[dict]) -> str:
+    """Formata o histórico de tentativas anteriores para inclusão no prompt."""
+    if not historico:
+        return "Nenhuma tentativa anterior."
+
+    partes = []
+    for i, tent in enumerate(historico, 1):
+        bloco = f"--- Tentativa {i} ---\n"
+        bloco += f"SQL gerada:\n{tent.get('sql', '(vazia)')}\n"
+        if tent.get("erro"):
+            bloco += f"Erro de execução: {tent['erro']}\n"
+        if tent.get("feedback"):
+            bloco += f"Feedback do crítico: {tent['feedback']}\n"
+        partes.append(bloco)
+
+    return "\n".join(partes) + "\nNÃO repita os mesmos erros. Gere uma SQL diferente e corrigida."
+
+
 def nos_nodo_agente_codigo(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -> dict:
     """
     Nó Agente de Código: usa Gemini para gerar SQL a partir da pergunta + schema.
@@ -55,26 +73,20 @@ def nos_nodo_agente_codigo(estado: EstadoTextToInsight, llm: ChatGoogleGenerativ
     pergunta = estado.get("pergunta_usuario", "")
     conversa_previa = estado.get("historico_conversa", "")
     schema = estado.get("contexto_schema", "")
-    feedback = estado.get("feedback_critico", "")
+    historico = estado.get("historico_tentativas", [])
     tentativas = estado.get("tentativas_loop", 0)
 
     print(f"[AGENTE_CODIGO] Gerando SQL (tentativa {tentativas + 1})...")
 
-    feedback_section = ""
-    if feedback:
-        feedback_section = f"""=== FEEDBACK DO CRÍTICO (corrija os problemas apontados) ===
-        {feedback}
-
-        === SQL ANTERIOR (que foi reprovada) ===
-        {estado.get('sql_gerada', '')}"""
+    historico_section = _formatar_historico_tentativas(historico)
 
     prompt = PROMPT_TEMPLATE.format(
         schema=schema,
         pergunta=pergunta,
         conversa_previa=conversa_previa if conversa_previa else "Nenhuma",
-        feedback_section=feedback_section,
+        historico_tentativas_section=historico_section,
     )
-
+    
     resposta = llm.invoke(prompt)
     sql = _extrair_sql(resposta.content)
 
@@ -91,3 +103,4 @@ def nos_nodo_agente_codigo(estado: EstadoTextToInsight, llm: ChatGoogleGenerativ
         "tokens_output": out_tokens,
         "tokens_total": total_tokens,
     }
+
diff --git a/text_to_insight/nodes/critic.py b/text_to_insight/nodes/critic.py
index af1bfbf..2831a26 100644
--- a/text_to_insight/nodes/critic.py
+++ b/text_to_insight/nodes/critic.py
@@ -33,10 +33,14 @@
 === ERROS (se houver) ===
 {erro}
 
+=== TENTATIVAS ANTERIORES ===
+{historico_tentativas_section}
+
 Avalie:
 1. A SQL responde à pergunta do usuário?
 2. Os resultados fazem sentido?
 3. Há algum erro lógico ou de interpretação?
+4. Se houve tentativas anteriores, verifique se os mesmos problemas persistem.
 
 Ao avaliar, priorize utilidade prática e correção semântica da resposta,
 não perfeição formal.
@@ -69,6 +73,24 @@
 FEEDBACK: <sua avaliação em 1-3 frases>"""
 
 
+def _formatar_historico_para_critico(historico: list[dict]) -> str:
+    """Formata o histórico de tentativas anteriores para o prompt do crítico."""
+    if not historico:
+        return "Nenhuma tentativa anterior (esta é a primeira)."
+
+    partes = []
+    for i, tent in enumerate(historico, 1):
+        bloco = f"--- Tentativa {i} ---\n"
+        bloco += f"SQL: {tent.get('sql', '(vazia)')}\n"
+        if tent.get("erro"):
+            bloco += f"Erro: {tent['erro']}\n"
+        if tent.get("feedback"):
+            bloco += f"Feedback: {tent['feedback']}\n"
+        partes.append(bloco)
+
+    return "\n".join(partes)
+
+
 def nos_nodo_critico(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -> dict:
     """
     Nó Crítico: usa Gemini para avaliar qualidade do resultado.
@@ -81,6 +103,7 @@ def nos_nodo_critico(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -
     conversa_previa = estado.get("historico_conversa", "")
     erro = estado.get("erro_execucao", "")
     status_exec = estado.get("status", "")
+    historico = estado.get("historico_tentativas", [])
 
     print("[CRITICO] Avaliando resultado...")
 
@@ -91,10 +114,13 @@ def nos_nodo_critico(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -
         return {
             "feedback_critico": feedback,
             "status": "reprovado",
+            # Registrar tentativa com erro no histórico
+            "historico_tentativas": [{"sql": sql, "erro": erro, "feedback": feedback}],
         }
 
     # Formata preview para o prompt
     preview_str = str(preview[:10]) if preview else "Nenhum resultado"
+    historico_section = _formatar_historico_para_critico(historico)
 
     prompt = PROMPT_CRITIC.format(
         pergunta=pergunta,
@@ -104,6 +130,7 @@ def nos_nodo_critico(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -
         total_linhas=total,
         preview=preview_str,
         erro=erro if erro else "Nenhum",
+        historico_tentativas_section=historico_section,
     )
 
     resposta = llm.invoke(prompt)
@@ -134,4 +161,7 @@ def nos_nodo_critico(estado: EstadoTextToInsight, llm: ChatGoogleGenerativeAI) -
         "tokens_input": in_tokens,
         "tokens_output": out_tokens,
         "tokens_total": total_tokens,
+        # Registrar esta tentativa no histórico (acumula via operator.add)
+        "historico_tentativas": [{"sql": sql, "feedback": feedback}],
     }
+
diff --git a/text_to_insight/runtime.py b/text_to_insight/runtime.py
index c1a7114..f8fe9b2 100644
--- a/text_to_insight/runtime.py
+++ b/text_to_insight/runtime.py
@@ -23,6 +23,7 @@ def construir_estado_inicial(pergunta: str, db_path: str) -> dict[str, Any]:
         "tentativas_loop": 0,
         "db_path": db_path,
         "espera_humana": False,
+        "historico_tentativas": [],
     }
 
 
diff --git a/text_to_insight/state.py b/text_to_insight/state.py
index 9d32550..fed878b 100644
--- a/text_to_insight/state.py
+++ b/text_to_insight/state.py
@@ -64,6 +64,7 @@ class EstadoTextToInsight(EstadoEntrada, total = False):
     historico_conversa: list[tuple[str, str]]
     tentativas_loop: int
     resposta_natural: str
+    historico_tentativas: Annotated[list[dict[str, str]], operator.add]
 
     # Campos exclusivos para métricas. Possibilita a soma automática dos tokens utilizados
     # por cada chamada do Gemini nos vários diferentes nós.

From 462453a3a8f5bf43ea9c148255e52348f6834265 Mon Sep 17 00:00:00 2001
From: Petroncini <caiopetroncini@gmail.com>
Date: Thu, 7 May 2026 11:20:10 -0300
Subject: [PATCH 10/10] fix: adicionei linhas resultado completo no estado do
 grafo

---
 text_to_insight/state.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/text_to_insight/state.py b/text_to_insight/state.py
index fed878b..7daf685 100644
--- a/text_to_insight/state.py
+++ b/text_to_insight/state.py
@@ -65,6 +65,7 @@ class EstadoTextToInsight(EstadoEntrada, total = False):
     tentativas_loop: int
     resposta_natural: str
     historico_tentativas: Annotated[list[dict[str, str]], operator.add]
+    linhas_resultado_completo: list[dict[str, Any]]
 
     # Campos exclusivos para métricas. Possibilita a soma automática dos tokens utilizados
     # por cada chamada do Gemini nos vários diferentes nós.