From 6718e66582dd44275ea00a9dd40289bb583bae90 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 20:55:06 -0500 Subject: [PATCH 1/5] feat: add private tessl tiles for codeflash rules, docs, and skills Three private tiles in the codeflash workspace: - codeflash-rules: 6 steering rules (code-style, architecture, optimization-patterns, git-conventions, testing-rules, language-rules) - codeflash-docs: 7 doc pages (index, domain-types, optimization-pipeline, context-extraction, verification, ai-service, configuration) - codeflash-skills: 2 skills (debug-optimization-failure, add-codeflash-feature) --- CLAUDE.md | 2 + tessl.json | 9 ++ tiles/codeflash-docs/docs/ai-service.md | 108 +++++++++++++ tiles/codeflash-docs/docs/configuration.md | 79 +++++++++ .../codeflash-docs/docs/context-extraction.md | 60 +++++++ tiles/codeflash-docs/docs/domain-types.md | 153 ++++++++++++++++++ tiles/codeflash-docs/docs/index.md | 41 +++++ .../docs/optimization-pipeline.md | 84 ++++++++++ tiles/codeflash-docs/docs/verification.md | 93 +++++++++++ tiles/codeflash-docs/tile.json | 7 + tiles/codeflash-rules/rules/architecture.md | 45 ++++++ tiles/codeflash-rules/rules/code-style.md | 11 ++ .../codeflash-rules/rules/git-conventions.md | 9 ++ tiles/codeflash-rules/rules/language-rules.md | 9 ++ .../rules/optimization-patterns.md | 11 ++ tiles/codeflash-rules/rules/testing-rules.md | 13 ++ tiles/codeflash-rules/tile.json | 26 +++ .../skills/add-codeflash-feature/SKILL.md | 96 +++++++++++ .../debug-optimization-failure/SKILL.md | 95 +++++++++++ tiles/codeflash-skills/tile.json | 14 ++ 20 files changed, 965 insertions(+) create mode 100644 tiles/codeflash-docs/docs/ai-service.md create mode 100644 tiles/codeflash-docs/docs/configuration.md create mode 100644 tiles/codeflash-docs/docs/context-extraction.md create mode 100644 tiles/codeflash-docs/docs/domain-types.md create mode 100644 tiles/codeflash-docs/docs/index.md create mode 100644 tiles/codeflash-docs/docs/optimization-pipeline.md 
create mode 100644 tiles/codeflash-docs/docs/verification.md create mode 100644 tiles/codeflash-docs/tile.json create mode 100644 tiles/codeflash-rules/rules/architecture.md create mode 100644 tiles/codeflash-rules/rules/code-style.md create mode 100644 tiles/codeflash-rules/rules/git-conventions.md create mode 100644 tiles/codeflash-rules/rules/language-rules.md create mode 100644 tiles/codeflash-rules/rules/optimization-patterns.md create mode 100644 tiles/codeflash-rules/rules/testing-rules.md create mode 100644 tiles/codeflash-rules/tile.json create mode 100644 tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md create mode 100644 tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md create mode 100644 tiles/codeflash-skills/tile.json diff --git a/CLAUDE.md b/CLAUDE.md index 33fbd0f69..622351db4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,3 +33,5 @@ Discovery → Ranking → Context Extraction → Test Gen + Optimization → Bas # Agent Rules @.tessl/RULES.md follow the [instructions](.tessl/RULES.md) + +@AGENTS.md diff --git a/tessl.json b/tessl.json index b05a1df44..7061e2c97 100644 --- a/tessl.json +++ b/tessl.json @@ -63,6 +63,15 @@ }, "tessl/pypi-filelock": { "version": "3.19.0" + }, + "codeflash/codeflash-rules": { + "version": "0.1.0" + }, + "codeflash/codeflash-docs": { + "version": "0.1.0" + }, + "codeflash/codeflash-skills": { + "version": "0.1.0" } } } diff --git a/tiles/codeflash-docs/docs/ai-service.md b/tiles/codeflash-docs/docs/ai-service.md new file mode 100644 index 000000000..4197a97d0 --- /dev/null +++ b/tiles/codeflash-docs/docs/ai-service.md @@ -0,0 +1,108 @@ +# AI Service + +How codeflash communicates with the AI optimization backend. + +## `AiServiceClient` (`api/aiservice.py`) + +The client connects to the AI service at `https://app.codeflash.ai` (or `http://localhost:8000` when `CODEFLASH_AIS_SERVER=local`). + +Authentication uses Bearer token from `get_codeflash_api_key()`. 
All requests go through `make_ai_service_request()` which handles JSON serialization via the Pydantic encoder. + +Timeout: 90s for production, 300s for local. + +## Endpoints + +### `/ai/optimize` — Generate Candidates + +Method: `optimize_code()` + +Sends source code + dependency context to generate optimization candidates. + +Payload: +- `source_code` — The read-writable code (markdown format) +- `dependency_code` — Read-only context code +- `trace_id` — Unique trace ID for the optimization run +- `language` — `"python"`, `"javascript"`, or `"typescript"` +- `n_candidates` — Number of candidates to generate (controlled by effort level) +- `is_async` — Whether the function is async +- `is_numerical_code` — Whether the code is numerical (affects optimization strategy) + +Returns: `list[OptimizedCandidate]` with `source=OptimizedCandidateSource.OPTIMIZE` + +### `/ai/optimize_line_profiler` — Line-Profiler-Guided Candidates + +Method: `optimize_python_code_line_profiler()` + +Like `/ai/optimize`, but includes `line_profiler_results` to guide the LLM toward hot lines. + +Returns: candidates with `source=OptimizedCandidateSource.OPTIMIZE_LP` + +### `/ai/refine` — Refine Existing Candidate + +Method: `refine_code()` + +Request type: `AIServiceRefinerRequest` + +Sends an existing candidate with runtime data and line profiler results to generate an improved version. + +Key fields: +- `original_source_code` / `optimized_source_code` — Before and after +- `original_code_runtime` / `optimized_code_runtime` — Timing data +- `speedup` — Current speedup ratio +- `original_line_profiler_results` / `optimized_line_profiler_results` + +Returns: candidates with `source=OptimizedCandidateSource.REFINE` and `parent_id` set to the ID of the candidate that was refined + +### `/ai/repair` — Fix Failed Candidate + +Method: `repair_code()` + +Request type: `AIServiceCodeRepairRequest` + +Sends a failed candidate with test diffs showing what went wrong. 
+ +Key fields: +- `original_source_code` / `modified_source_code` +- `test_diffs: list[TestDiff]` — Each with `scope` (return_value/stdout/did_pass), original vs candidate values, and test source code + +Returns: candidates with `source=OptimizedCandidateSource.REPAIR` and `parent_id` set + +### `/ai/adaptive_optimize` — Multi-Candidate Adaptive + +Method: `adaptive_optimize()` + +Request type: `AIServiceAdaptiveOptimizeRequest` + +Sends multiple previous candidates with their speedups for the LLM to learn from and generate better candidates. + +Key fields: +- `candidates: list[AdaptiveOptimizedCandidate]` — Previous candidates with source code, explanation, source type, and speedup + +Returns: candidates with `source=OptimizedCandidateSource.ADAPTIVE` + +### `/ai/rewrite_jit` — JIT Rewrite + +Method: `get_jit_rewritten_code()` + +Rewrites code to use JIT compilation (e.g., Numba). + +Returns: candidates with `source=OptimizedCandidateSource.JIT_REWRITE` + +## Candidate Parsing + +All endpoints return JSON with an `optimizations` array. Each entry has: +- `source_code` — Markdown-formatted code blocks +- `explanation` — LLM explanation +- `optimization_id` — Unique ID +- `parent_id` — Optional parent reference +- `model` — Which LLM model was used + +`_get_valid_candidates()` parses the markdown code via `CodeStringsMarkdown.parse_markdown_code()` and filters out entries with empty code blocks. + +## `LocalAiServiceClient` + +Used when `CODEFLASH_EXPERIMENT_ID` is set. Mirrors `AiServiceClient` but sends to a separate experimental endpoint for A/B testing optimization strategies. + +## LLM Call Sequencing + +`AiServiceClient` tracks call sequence via `llm_call_counter` (itertools.count). Each request includes a `call_sequence` number, used by the backend to maintain conversation context across multiple calls for the same function. 
diff --git a/tiles/codeflash-docs/docs/configuration.md b/tiles/codeflash-docs/docs/configuration.md new file mode 100644 index 000000000..32dd8d53d --- /dev/null +++ b/tiles/codeflash-docs/docs/configuration.md @@ -0,0 +1,79 @@ +# Configuration + +Key configuration constants, effort levels, and thresholds. + +## Constants (`code_utils/config_consts.py`) + +### Test Execution + +| Constant | Value | Description | +|----------|-------|-------------| +| `MAX_TEST_RUN_ITERATIONS` | 5 | Maximum test loop iterations | +| `INDIVIDUAL_TESTCASE_TIMEOUT` | 15s | Timeout per individual test case | +| `MAX_FUNCTION_TEST_SECONDS` | 60s | Max total time for function testing | +| `MAX_TEST_FUNCTION_RUNS` | 50 | Max test function executions | +| `MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS` | 100ms | Max cumulative test runtime | +| `TOTAL_LOOPING_TIME` | 10s | Candidate benchmarking budget | +| `MIN_TESTCASE_PASSED_THRESHOLD` | 6 | Minimum test cases that must pass | + +### Performance Thresholds + +| Constant | Value | Description | +|----------|-------|-------------| +| `MIN_IMPROVEMENT_THRESHOLD` | 0.05 (5%) | Minimum speedup to accept a candidate | +| `MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD` | 0.10 (10%) | Minimum async throughput improvement | +| `MIN_CONCURRENCY_IMPROVEMENT_THRESHOLD` | 0.20 (20%) | Minimum concurrency ratio improvement | +| `COVERAGE_THRESHOLD` | 60.0% | Minimum test coverage | + +### Stability Thresholds + +| Constant | Value | Description | +|----------|-------|-------------| +| `STABILITY_WINDOW_SIZE` | 0.35 | 35% of total iteration window | +| `STABILITY_CENTER_TOLERANCE` | 0.0025 | ±0.25% around median | +| `STABILITY_SPREAD_TOLERANCE` | 0.0025 | 0.25% window spread | + +### Context Limits + +| Constant | Value | Description | +|----------|-------|-------------| +| `OPTIMIZATION_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for optimization context | +| `TESTGEN_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for test generation context | +| `MAX_CONTEXT_LEN_REVIEW` 
| 1000 | Max context length for optimization review | + +### Other + +| Constant | Value | Description | +|----------|-------|-------------| +| `MIN_CORRECT_CANDIDATES` | 2 | Min correct candidates before skipping repair | +| `REPEAT_OPTIMIZATION_PROBABILITY` | 0.1 | Probability of re-optimizing a function | +| `DEFAULT_IMPORTANCE_THRESHOLD` | 0.001 | Minimum addressable time to consider a function | +| `CONCURRENCY_FACTOR` | 10 | Number of concurrent executions for concurrency benchmark | +| `REFINED_CANDIDATE_RANKING_WEIGHTS` | (2, 1) | (runtime, diff) weights — runtime 2x more important | + +## Effort Levels + +`EffortLevel` enum: `LOW`, `MEDIUM`, `HIGH` + +Effort controls the number of candidates, repairs, and refinements: + +| Key | LOW | MEDIUM | HIGH | +|-----|-----|--------|------| +| `N_OPTIMIZER_CANDIDATES` | 3 | 5 | 6 | +| `N_OPTIMIZER_LP_CANDIDATES` | 4 | 6 | 7 | +| `N_GENERATED_TESTS` | 2 | 2 | 2 | +| `MAX_CODE_REPAIRS_PER_TRACE` | 2 | 3 | 5 | +| `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` | 0.2 | 0.3 | 0.4 | +| `TOP_VALID_CANDIDATES_FOR_REFINEMENT` | 2 | 3 | 4 | +| `ADAPTIVE_OPTIMIZATION_THRESHOLD` | 0 | 0 | 2 | +| `MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE` | 0 | 0 | 4 | + +Use `get_effort_value(EffortKeys.KEY, effort_level)` to retrieve values. + +## Project Configuration + +Configuration is read from `pyproject.toml` under `[tool.codeflash]`. Key settings are auto-detected by `setup/detector.py`: +- `module-root` — Root of the module to optimize +- `tests-root` — Root of test files +- `test-framework` — pytest, unittest, jest, etc. +- `formatter-cmds` — Code formatting commands diff --git a/tiles/codeflash-docs/docs/context-extraction.md b/tiles/codeflash-docs/docs/context-extraction.md new file mode 100644 index 000000000..8e0f366c9 --- /dev/null +++ b/tiles/codeflash-docs/docs/context-extraction.md @@ -0,0 +1,60 @@ +# Context Extraction + +How codeflash extracts and limits code context for optimization and test generation. 
+ +## Overview + +Context extraction (`context/code_context_extractor.py`) builds a `CodeOptimizationContext` containing all code needed for the LLM to understand and optimize a function, split into: + +- **Read-writable code** (`CodeContextType.READ_WRITABLE`): The function being optimized plus its helper functions — code the LLM is allowed to modify +- **Read-only context** (`CodeContextType.READ_ONLY`): Dependency code for reference — imports, type definitions, base classes +- **Testgen context** (`CodeContextType.TESTGEN`): Context for test generation, may include imported class definitions and external base class inits +- **Hashing context** (`CodeContextType.HASHING`): Used for deduplication of optimization runs + +## Token Limits + +Both optimization and test generation contexts are token-limited: +- `OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000` tokens +- `TESTGEN_CONTEXT_TOKEN_LIMIT = 16000` tokens + +Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py`. Functions whose context exceeds these limits are skipped. + +## Context Building Process + +### 1. Helper Discovery + +For the target function (`FunctionToOptimize`), the extractor finds: +- **Helpers of the function**: Functions/classes in the same file that the target function calls +- **Helpers of helpers**: Transitive dependencies of the helper functions + +These are organized as `dict[Path, set[FunctionSource]]` — mapping file paths to the set of helper functions found in each file. + +### 2. Code Extraction + +`extract_code_markdown_context_from_files()` builds `CodeStringsMarkdown` from the helper dictionaries. Each file's relevant code is extracted as a `CodeString` with its file path. + +### 3. Testgen Context Enrichment + +`build_testgen_context()` extends the basic context with: +- Imported class definitions (resolved from imports) +- External base class `__init__` methods +- External class `__init__` methods referenced in the context + +### 4. 
Unused Definition Removal + +`detect_unused_helper_functions()` and `remove_unused_definitions_by_function_names()` from `context/unused_definition_remover.py` prune definitions that are not transitively reachable from the target function, reducing token usage. + +### 5. Deduplication + +The hashing context (`hashing_code_context`) generates a hash (`hashing_code_context_hash`) used to detect when the same function context has already been optimized in a previous run, avoiding redundant work. + +## Key Functions + +| Function | Location | Purpose | +|----------|----------|---------| +| `build_testgen_context()` | `context/code_context_extractor.py` | Build enriched testgen context | +| `extract_code_markdown_context_from_files()` | `context/code_context_extractor.py` | Convert helper dicts to `CodeStringsMarkdown` | +| `detect_unused_helper_functions()` | `context/unused_definition_remover.py` | Find unused definitions | +| `remove_unused_definitions_by_function_names()` | `context/unused_definition_remover.py` | Remove unused definitions | +| `collect_top_level_defs_with_usages()` | `context/unused_definition_remover.py` | Analyze definition usage | +| `encoded_tokens_len()` | `code_utils/code_utils.py` | Count tokens in code | diff --git a/tiles/codeflash-docs/docs/domain-types.md b/tiles/codeflash-docs/docs/domain-types.md new file mode 100644 index 000000000..7bc2dd868 --- /dev/null +++ b/tiles/codeflash-docs/docs/domain-types.md @@ -0,0 +1,153 @@ +# Domain Types + +Core data types used throughout the codeflash optimization pipeline. + +## Function Representation + +### `FunctionToOptimize` (`models/function_types.py`) + +The canonical dataclass representing a function candidate for optimization. Works across Python, JavaScript, and TypeScript. 
+ +Key fields: +- `function_name: str` — The function name +- `file_path: Path` — Absolute file path where the function is located +- `parents: list[FunctionParent]` — Parent scopes (classes/functions), each with `name` and `type` +- `starting_line / ending_line: Optional[int]` — Line range (1-indexed) +- `is_async: bool` — Whether the function is async +- `is_method: bool` — Whether it belongs to a class +- `language: str` — Programming language (default: `"python"`) + +Key properties: +- `qualified_name` — Full dotted name including parent classes (e.g., `MyClass.my_method`) +- `top_level_parent_name` — Name of outermost parent, or function name if no parents +- `class_name` — Immediate parent class name, or `None` + +### `FunctionParent` (`models/function_types.py`) + +Represents a parent scope: `name: str` (e.g., `"MyClass"`) and `type: str` (e.g., `"ClassDef"`). + +### `FunctionSource` (`models/models.py`) + +Represents a resolved function with source code. Used for helper functions in context extraction. + +Fields: `file_path`, `qualified_name`, `fully_qualified_name`, `only_function_name`, `source_code`, `jedi_definition`. + +## Code Representation + +### `CodeString` (`models/models.py`) + +A single code block with validated syntax: +- `code: str` — The source code +- `file_path: Optional[Path]` — Origin file path +- `language: str` — Language for validation (default: `"python"`) + +Validates syntax on construction via `model_validator`. + +### `CodeStringsMarkdown` (`models/models.py`) + +A collection of `CodeString` blocks — the primary format for passing code through the pipeline. 
+ +Key properties: +- `.flat` — Combined source code with file-path comment prefixes (e.g., `# file: path/to/file.py`) +- `.markdown` — Markdown-formatted with fenced code blocks: `` ```python:filepath\ncode\n``` `` +- `.file_to_path()` — Dict mapping file path strings to code + +Static method: +- `parse_markdown_code(markdown_code, expected_language)` — Parses markdown code blocks back into `CodeStringsMarkdown` + +## Optimization Context + +### `CodeOptimizationContext` (`models/models.py`) + +Holds all code context needed for optimization: +- `read_writable_code: CodeStringsMarkdown` — Code the LLM can modify +- `read_only_context_code: str` — Reference-only dependency code +- `testgen_context: CodeStringsMarkdown` — Context for test generation +- `hashing_code_context: str` / `hashing_code_context_hash: str` — For deduplication +- `helper_functions: list[FunctionSource]` — Helper functions in the writable code +- `preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]` — Objects that already exist in the code + +### `CodeContextType` enum (`models/models.py`) + +Defines context categories: `READ_WRITABLE`, `READ_ONLY`, `TESTGEN`, `HASHING`. + +## Candidates + +### `OptimizedCandidate` (`models/models.py`) + +A generated code variant: +- `source_code: CodeStringsMarkdown` — The optimized code +- `explanation: str` — LLM explanation of the optimization +- `optimization_id: str` — Unique identifier +- `source: OptimizedCandidateSource` — How it was generated +- `parent_id: str | None` — ID of parent candidate (for refinements/repairs) +- `model: str | None` — Which LLM model generated it + +### `OptimizedCandidateSource` enum (`models/models.py`) + +How a candidate was generated: `OPTIMIZE`, `OPTIMIZE_LP` (line profiler), `REFINE`, `REPAIR`, `ADAPTIVE`, `JIT_REWRITE`. 
+ +### `CandidateEvaluationContext` (`models/models.py`) + +Tracks state during candidate evaluation: +- `speedup_ratios` / `optimized_runtimes` / `is_correct` — Per-candidate results +- `ast_code_to_id` — Deduplication map (normalized AST → first seen candidate) +- `valid_optimizations` — Candidates that passed all checks + +Key methods: `record_failed_candidate()`, `record_successful_candidate()`, `handle_duplicate_candidate()`, `register_new_candidate()`. + +## Baseline & Results + +### `OriginalCodeBaseline` (`models/models.py`) + +Baseline measurements for the original code: +- `behavior_test_results: TestResults` / `benchmarking_test_results: TestResults` +- `line_profile_results: dict` +- `runtime: int` — Total runtime in nanoseconds +- `coverage_results: Optional[CoverageData]` + +### `BestOptimization` (`models/models.py`) + +The winning candidate after evaluation: +- `candidate: OptimizedCandidate` +- `helper_functions: list[FunctionSource]` +- `code_context: CodeOptimizationContext` +- `runtime: int` +- `winning_behavior_test_results` / `winning_benchmarking_test_results: TestResults` + +## Test Types + +### `TestType` enum (`models/test_type.py`) + +- `EXISTING_UNIT_TEST` (1) — Pre-existing tests from the codebase +- `INSPIRED_REGRESSION` (2) — Tests inspired by existing tests +- `GENERATED_REGRESSION` (3) — AI-generated regression tests +- `REPLAY_TEST` (4) — Tests from recorded benchmark data +- `CONCOLIC_COVERAGE_TEST` (5) — Coverage-guided tests +- `INIT_STATE_TEST` (6) — Class init state verification + +### `TestFile` / `TestFiles` (`models/models.py`) + +`TestFile` represents a single test file with `instrumented_behavior_file_path`, optional `benchmarking_file_path`, `original_file_path`, `test_type`, and `tests_in_file`. + +`TestFiles` is a collection with lookup methods: `get_by_type()`, `get_by_original_file_path()`, `get_test_type_by_instrumented_file_path()`. 
+ +### `TestResults` (`models/models.py`) + +Collection of `FunctionTestInvocation` results with indexed lookup. Key methods: +- `add(invocation)` — Deduplicated insert +- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds) +- `number_of_loops()` — Max loop index across all results +- `usable_runtime_data_by_test_case()` — Dict of invocation ID → list of runtimes + +## Result Type + +### `Result[L, R]` / `Success` / `Failure` (`either.py`) + +Functional error handling type: +- `Success(value)` — Wraps a successful result +- `Failure(error)` — Wraps an error +- `result.is_successful()` / `result.is_failure()` — Check type +- `result.unwrap()` — Get success value (raises if Failure) +- `result.failure()` — Get failure value (raises if Success) +- `is_successful(result)` — Module-level helper function diff --git a/tiles/codeflash-docs/docs/index.md b/tiles/codeflash-docs/docs/index.md new file mode 100644 index 000000000..930e287eb --- /dev/null +++ b/tiles/codeflash-docs/docs/index.md @@ -0,0 +1,41 @@ +# Codeflash Internal Documentation + +CodeFlash is an AI-powered Python code optimizer that automatically improves code performance while maintaining correctness. It uses LLMs to generate optimization candidates, verifies correctness through test execution, and benchmarks performance improvements. + +## Pipeline Overview + +``` +Discovery → Ranking → Context Extraction → Test Gen + Optimization → Baseline → Candidate Evaluation → PR +``` + +1. **Discovery** (`discovery/`): Find optimizable functions across the codebase using `FunctionVisitor` +2. **Ranking** (`benchmarking/function_ranker.py`): Rank functions by addressable time using trace data +3. **Context** (`context/`): Extract code dependencies — split into read-writable (modifiable) and read-only (reference) +4. **Optimization** (`optimization/`, `api/`): Generate candidates via AI service, runs concurrently with test generation +5. 
**Verification** (`verification/`): Run candidates against tests via custom pytest plugin, compare outputs +6. **Benchmarking** (`benchmarking/`): Measure performance, select best candidate by speedup +7. **Result** (`result/`, `github/`): Create PR with winning optimization + +## Key Entry Points + +| Task | File | +|------|------| +| CLI arguments & commands | `cli_cmds/cli.py` | +| Optimization orchestration | `optimization/optimizer.py` → `Optimizer.run()` | +| Per-function optimization | `optimization/function_optimizer.py` → `FunctionOptimizer` | +| Function discovery | `discovery/functions_to_optimize.py` | +| Context extraction | `context/code_context_extractor.py` | +| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` | +| Performance ranking | `benchmarking/function_ranker.py` | +| Domain types | `models/models.py`, `models/function_types.py` | +| AI service | `api/aiservice.py` → `AiServiceClient` | +| Configuration | `code_utils/config_consts.py` | + +## Documentation Pages + +- [Domain Types](domain-types.md) — Core data types and their relationships +- [Optimization Pipeline](optimization-pipeline.md) — Step-by-step data flow through the pipeline +- [Context Extraction](context-extraction.md) — How code context is extracted and token-limited +- [Verification](verification.md) — Test execution, pytest plugin, deterministic patches +- [AI Service](ai-service.md) — AI service client endpoints and request types +- [Configuration](configuration.md) — Configuration constants, effort levels, thresholds diff --git a/tiles/codeflash-docs/docs/optimization-pipeline.md b/tiles/codeflash-docs/docs/optimization-pipeline.md new file mode 100644 index 000000000..9a3879ccc --- /dev/null +++ b/tiles/codeflash-docs/docs/optimization-pipeline.md @@ -0,0 +1,84 @@ +# Optimization Pipeline + +Step-by-step data flow from function discovery to PR creation. + +## 1. 
Entry Point: `Optimizer.run()` (`optimization/optimizer.py`) + +The `Optimizer` class is initialized with CLI args and creates: +- `TestConfig` with test roots, project root, pytest command +- `AiServiceClient` for AI service communication +- Optional `LocalAiServiceClient` for experiments + +`run()` orchestrates the full pipeline: discovers functions, optionally ranks them, then optimizes each in turn. + +## 2. Function Discovery (`discovery/functions_to_optimize.py`) + +`FunctionVisitor` traverses source files to find optimizable functions, producing `FunctionToOptimize` instances. Filters include: +- Skipping functions that are too small or trivial +- Skipping previously optimized functions (via `was_function_previously_optimized()`) +- Applying user-configured include/exclude patterns + +## 3. Function Ranking (`benchmarking/function_ranker.py`) + +When trace data is available, `FunctionRanker` ranks functions by **addressable time** — the time a function spends that could be optimized (own time + callee time / call count). Functions below `DEFAULT_IMPORTANCE_THRESHOLD=0.001` are skipped. + +## 4. Per-Function Optimization: `FunctionOptimizer` (`optimization/function_optimizer.py`) + +For each function, `FunctionOptimizer.optimize_function()` runs the full optimization loop: + +### 4a. Context Extraction (`context/code_context_extractor.py`) + +Extracts `CodeOptimizationContext` containing: +- `read_writable_code` — Code the LLM can modify (the function + helpers) +- `read_only_context_code` — Dependency code for reference only +- `testgen_context` — Context for test generation (may include imported class definitions) + +Token limits are enforced: `OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000` and `TESTGEN_CONTEXT_TOKEN_LIMIT=16000`. Functions exceeding these are rejected. + +### 4b. 
Concurrent Test Generation + LLM Optimization + +These run in parallel using `concurrent.futures`: +- **Test generation**: Generates regression tests from the function context +- **LLM optimization**: Sends `read_writable_code.markdown` + `read_only_context_code` to the AI service + +The number of candidates depends on effort level (see Configuration docs). + +### 4c. Candidate Evaluation + +For each `OptimizedCandidate`: + +1. **Deduplication**: Normalize code AST and check against `CandidateEvaluationContext.ast_code_to_id`. If duplicate, copy results from previous evaluation. + +2. **Code replacement**: Replace the original function with the candidate using `replace_function_definitions_in_module()`. + +3. **Behavioral testing**: Run instrumented tests in subprocess. The custom pytest plugin applies deterministic patches. Compare return values, stdout, and pass/fail status against the original baseline. + +4. **Benchmarking**: If behavior matches, run performance tests with looping (`TOTAL_LOOPING_TIME=10s`). Calculate speedup ratio. + +5. **Validation**: Candidate must beat `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) and pass stability checks. + +### 4d. Refinement & Repair + +- **Repair**: If fewer than `MIN_CORRECT_CANDIDATES=2` pass, failed candidates can be repaired via `AIServiceCodeRepairRequest` (sends test diffs to LLM). +- **Refinement**: Top valid candidates are refined via `AIServiceRefinerRequest` (sends runtime data, line profiler results). +- **Adaptive**: At HIGH effort, additional adaptive optimization rounds via `AIServiceAdaptiveOptimizeRequest`. + +### 4e. Best Candidate Selection + +The winning candidate is selected by: +1. Highest speedup ratio +2. For tied speedups, shortest diff length from original +3. Refinement candidates use weighted ranking: `(2 * runtime_rank + 1 * diff_rank)` + +Result is a `BestOptimization` with the candidate, context, test results, and runtime. + +## 5. 
PR Creation (`github/`) + +If a winning candidate is found, a PR is created with: +- The optimized code diff +- Performance benchmark details +- Explanation from the LLM + +## Worktree Mode + +When `--worktree` is enabled, optimization runs in an isolated git worktree (`code_utils/git_worktree_utils.py`). This allows parallel optimization without affecting the working tree. Changes are captured as patch files. diff --git a/tiles/codeflash-docs/docs/verification.md b/tiles/codeflash-docs/docs/verification.md new file mode 100644 index 000000000..2a84f9340 --- /dev/null +++ b/tiles/codeflash-docs/docs/verification.md @@ -0,0 +1,93 @@ +# Verification + +How codeflash verifies candidate correctness and measures performance. + +## Test Execution Architecture + +Tests are executed in a **subprocess** to isolate the test environment from the main codeflash process. The test runner (`verification/test_runner.py`) invokes pytest (or Jest for JS/TS) with specific plugin configurations. + +### Plugin Blocklists + +- **Behavioral tests**: Block `benchmark`, `codspeed`, `xdist`, `sugar` +- **Benchmarking tests**: Block `codspeed`, `cov`, `benchmark`, `profiling`, `xdist`, `sugar` + +These are defined as `BEHAVIORAL_BLOCKLISTED_PLUGINS` and `BENCHMARKING_BLOCKLISTED_PLUGINS` in `verification/test_runner.py`. 
+ +## Custom Pytest Plugin (`verification/pytest_plugin.py`) + +The plugin is loaded into the test subprocess and provides: + +### Deterministic Patches + +`_apply_deterministic_patches()` replaces non-deterministic functions with fixed values to ensure reproducible test output: + +| Module | Function | Fixed Value | +|--------|----------|-------------| +| `time` | `time()` | `1761717605.108106` | +| `time` | `perf_counter()` | Incrementing by 1ms per call | +| `datetime` | `datetime.now()` | `2021-01-01 02:05:10 UTC` | +| `datetime` | `datetime.utcnow()` | `2021-01-01 02:05:10 UTC` | +| `uuid` | `uuid4()` / `uuid1()` | `12345678-1234-5678-9abc-123456789012` | +| `random` | `random()` | `0.123456789` (seeded with 42) | +| `os` | `urandom(n)` | `b"\x42" * n` | +| `numpy.random` | seed | `42` | + +Patches call the original function first to maintain performance characteristics (same call overhead). + +### Timing Markers + +Test results include timing markers in stdout: `!######:######!` + +The pattern `_TIMING_MARKER_PATTERN` extracts timing data for calculating function utilization fraction. + +### Loop Stability + +Performance benchmarking uses configurable stability thresholds: +- `STABILITY_WINDOW_SIZE = 0.35` (35% of total iterations) +- `STABILITY_CENTER_TOLERANCE = 0.0025` (±0.25% around median) +- `STABILITY_SPREAD_TOLERANCE = 0.0025` (0.25% window spread) + +### Memory Limits (Linux) + +On Linux, the plugin sets `RLIMIT_AS` to 85% of total system memory (RAM + swap) to prevent OOM kills. 
+ +## Test Result Processing + +### `TestResults` (`models/models.py`) + +Collects `FunctionTestInvocation` results with: +- Deduplicated insertion via `unique_invocation_loop_id` +- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds) +- `number_of_loops()` — Max loop index +- `usable_runtime_data_by_test_case()` — Grouped timing data + +### `FunctionTestInvocation` + +Each invocation records: +- `loop_index` — Iteration number (starts at 1) +- `id: InvocationId` — Fully qualified test identifier +- `did_pass: bool` — Pass/fail status +- `runtime: Optional[int]` — Time in nanoseconds +- `return_value: Optional[object]` — Captured return value +- `test_type: TestType` — Which test category + +### Behavioral vs Performance Testing + +1. **Behavioral**: Runs with `TestingMode.BEHAVIOR`. Compares return values, stdout, and pass/fail status between original and candidate. Any difference causes the candidate to be rejected. +2. **Performance**: Runs with `TestingMode.PERFORMANCE`. Loops for `TOTAL_LOOPING_TIME=10s` to get stable timing. Calculates speedup ratio. +3. **Line Profile**: Runs with `TestingMode.LINE_PROFILE`. Collects per-line timing data for refinement. + +## Test Types + +| TestType | Value | Description | +|----------|-------|-------------| +| `EXISTING_UNIT_TEST` | 1 | Pre-existing tests from the codebase | +| `INSPIRED_REGRESSION` | 2 | Tests inspired by existing tests | +| `GENERATED_REGRESSION` | 3 | AI-generated regression tests | +| `REPLAY_TEST` | 4 | Tests from recorded benchmark data | +| `CONCOLIC_COVERAGE_TEST` | 5 | Coverage-guided tests | +| `INIT_STATE_TEST` | 6 | Class init state verification | + +## Coverage + +Coverage is measured via `CoverageData` with a threshold of `COVERAGE_THRESHOLD=60.0%`. Low coverage may affect confidence in the optimization's correctness. 
diff --git a/tiles/codeflash-docs/tile.json b/tiles/codeflash-docs/tile.json new file mode 100644 index 000000000..8d18aa129 --- /dev/null +++ b/tiles/codeflash-docs/tile.json @@ -0,0 +1,7 @@ +{ + "name": "codeflash/codeflash-docs", + "version": "0.1.0", + "summary": "Internal documentation for the codeflash optimization engine", + "private": true, + "docs": "docs/index.md" +} diff --git a/tiles/codeflash-rules/rules/architecture.md b/tiles/codeflash-rules/rules/architecture.md new file mode 100644 index 000000000..3aaf78507 --- /dev/null +++ b/tiles/codeflash-rules/rules/architecture.md @@ -0,0 +1,45 @@ +# Architecture + +``` +codeflash/ +├── main.py # CLI entry point +├── cli_cmds/ # Command handling, console output (Rich) +├── discovery/ # Find optimizable functions +├── context/ # Extract code dependencies and imports +├── optimization/ # Generate optimized code via AI +│ ├── optimizer.py # Main optimization orchestration +│ └── function_optimizer.py # Per-function optimization logic +├── verification/ # Run deterministic tests (pytest plugin) +├── benchmarking/ # Performance measurement +├── github/ # PR creation +├── api/ # AI service communication +├── code_utils/ # Code parsing, git utilities +├── models/ # Pydantic models and types +├── languages/ # Multi-language support (Python, JavaScript/TypeScript) +├── setup/ # Config schema, auto-detection, first-run experience +├── picklepatch/ # Serialization/deserialization utilities +├── tracing/ # Function call tracing +├── tracer.py # Root-level tracer entry point for profiling +├── lsp/ # IDE integration (Language Server Protocol) +├── telemetry/ # Sentry, PostHog +├── either.py # Functional Result type for error handling +├── result/ # Result types and handling +└── version.py # Version information +``` + +## Key Entry Points + +| Task | Start here | +|------|------------| +| CLI arguments & commands | `cli_cmds/cli.py` | +| Optimization orchestration | `optimization/optimizer.py` → `Optimizer.run()` | +| 
Per-function optimization | `optimization/function_optimizer.py` → `FunctionOptimizer` | +| Function discovery | `discovery/functions_to_optimize.py` | +| Context extraction | `context/code_context_extractor.py` | +| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` | +| Performance ranking | `benchmarking/function_ranker.py` | +| Domain types | `models/models.py`, `models/function_types.py` | +| Result handling | `either.py` (`Result`, `Success`, `Failure`, `is_successful`) | +| AI service communication | `api/aiservice.py` → `AiServiceClient` | +| Configuration constants | `code_utils/config_consts.py` | +| Language support | `languages/registry.py` → `get_language_support()` | diff --git a/tiles/codeflash-rules/rules/code-style.md b/tiles/codeflash-rules/rules/code-style.md new file mode 100644 index 000000000..2a2fbdf6b --- /dev/null +++ b/tiles/codeflash-rules/rules/code-style.md @@ -0,0 +1,11 @@ +# Code Style + +- **Line length**: 120 characters +- **Python**: 3.9+ syntax (use `from __future__ import annotations` for type hints) +- **Package management**: Always use `uv`, never `pip` — run commands via `uv run` +- **Tooling**: Ruff for linting/formatting, mypy strict mode, prek for pre-commit checks (`uv run prek run`) +- **Comments**: Minimal — only explain "why", not "what" +- **Docstrings**: Do not add unless explicitly requested +- **Naming**: NEVER use leading underscores (`_function_name`) — Python has no true private functions, use public names +- **Paths**: Always use absolute `Path` objects, handle encoding explicitly (UTF-8) +- **Source transforms**: Use `libcst` for code modification/transformation to preserve formatting; `ast` is acceptable for read-only analysis and parsing diff --git a/tiles/codeflash-rules/rules/git-conventions.md b/tiles/codeflash-rules/rules/git-conventions.md new file mode 100644 index 000000000..1835dfdca --- /dev/null +++ b/tiles/codeflash-rules/rules/git-conventions.md @@ -0,0 +1,9 @@ +# Git 
Conventions + +- **Always create a new branch from `main`** — never commit directly to `main` or reuse an existing feature branch for unrelated changes +- Use conventional commit format: `fix:`, `feat:`, `refactor:`, `docs:`, `test:`, `chore:` +- Keep commits atomic — one logical change per commit +- Commit message body should be concise (1-2 sentences max) +- PR titles should also use conventional format +- Branch naming: `cf-#-title` (lowercase, hyphenated) where `#` is the Linear issue number +- If related to a Linear issue, include `CF-#` in the PR body diff --git a/tiles/codeflash-rules/rules/language-rules.md b/tiles/codeflash-rules/rules/language-rules.md new file mode 100644 index 000000000..3b045a4f4 --- /dev/null +++ b/tiles/codeflash-rules/rules/language-rules.md @@ -0,0 +1,9 @@ +# Language Support Rules + +- Current language is a module-level singleton in `languages/current.py` — use `set_current_language()` / `current_language()`, never pass language as a parameter through call chains +- Use `get_language_support(identifier)` from `languages/registry.py` to get a `LanguageSupport` instance — accepts `Path`, `Language` enum, or string; never import language classes directly +- New language support classes must use the `@register_language` decorator to register with the extension and language registries +- `languages/__init__.py` uses `__getattr__` for lazy imports to avoid circular dependencies — follow this pattern when adding new exports +- `is_javascript()` returns `True` for both JavaScript and TypeScript +- Language modules are lazily imported on first `get_language_support()` call via `_ensure_languages_registered()` — the `@register_language` decorator fires on import and populates `_EXTENSION_REGISTRY` and `_LANGUAGE_REGISTRY` +- `LanguageSupport` instances are cached in `_SUPPORT_CACHE` — use `clear_cache()` only in tests diff --git a/tiles/codeflash-rules/rules/optimization-patterns.md b/tiles/codeflash-rules/rules/optimization-patterns.md new 
file mode 100644 index 000000000..7b879d227 --- /dev/null +++ b/tiles/codeflash-rules/rules/optimization-patterns.md @@ -0,0 +1,11 @@ +# Optimization Pipeline Patterns + +- All major operations return `Result[SuccessType, ErrorType]` — construct with `Success(value)` / `Failure(error)`, check with `is_successful()` before calling `unwrap()` +- Code context has token limits (`OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000`, `TESTGEN_CONTEXT_TOKEN_LIMIT=16000` in `code_utils/config_consts.py`) — exceeding them rejects the function +- `read_writable_code` (modifiable code) can span multiple files; `read_only_context_code` is reference-only dependency code +- Code is serialized as markdown code blocks: `` ```language:filepath\ncode\n``` `` — see `CodeStringsMarkdown` in `models/models.py` +- Candidates form a forest (DAG): refinements/repairs reference `parent_id` on previous candidates via `OptimizedCandidateSource` (OPTIMIZE, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE) +- Test generation and optimization run concurrently — coordinate through `CandidateEvaluationContext` +- Generated tests are instrumented with `codeflash_capture.py` to record return values and traces +- Minimum improvement threshold is 5% (`MIN_IMPROVEMENT_THRESHOLD=0.05`) — candidates below this are rejected +- Stability thresholds: `STABILITY_WINDOW_SIZE=0.35`, `STABILITY_CENTER_TOLERANCE=0.0025`, `STABILITY_SPREAD_TOLERANCE=0.0025` diff --git a/tiles/codeflash-rules/rules/testing-rules.md b/tiles/codeflash-rules/rules/testing-rules.md new file mode 100644 index 000000000..780b48d60 --- /dev/null +++ b/tiles/codeflash-rules/rules/testing-rules.md @@ -0,0 +1,13 @@ +# Testing Rules + +- Code context extraction and replacement tests must assert full string equality — no substring matching +- Use pytest's `tmp_path` fixture for temp directories (it's a `Path` object) +- Write temp files inside `tmp_path`, never use `NamedTemporaryFile` (causes Windows file contention) +- Always call `.resolve()` on Path objects to 
ensure absolute paths and resolve symlinks +- Use `.as_posix()` when converting resolved paths to strings (normalizes to forward slashes) +- Any new feature or bug fix that can be tested automatically must have test cases +- If changes affect existing test expectations, update the tests accordingly — tests must always pass after changes +- The pytest plugin patches `time`, `random`, `uuid`, `datetime`, `os.urandom`, and `numpy.random` for deterministic test execution — never assume real randomness or real time in verification tests +- `conftest.py` uses an autouse fixture that calls `reset_current_language()` — tests always start with Python as the default language +- Test types are defined by the `TestType` enum: `EXISTING_UNIT_TEST`, `INSPIRED_REGRESSION`, `GENERATED_REGRESSION`, `REPLAY_TEST`, `CONCOLIC_COVERAGE_TEST`, `INIT_STATE_TEST` +- Verification runs tests in a subprocess using a custom pytest plugin (`verification/pytest_plugin.py`) — behavioral tests use blocklisted plugins (`benchmark`, `codspeed`, `xdist`, `sugar`), benchmarking tests additionally block `cov` and `profiling` diff --git a/tiles/codeflash-rules/tile.json b/tiles/codeflash-rules/tile.json new file mode 100644 index 000000000..a286ba09b --- /dev/null +++ b/tiles/codeflash-rules/tile.json @@ -0,0 +1,26 @@ +{ + "name": "codeflash/codeflash-rules", + "version": "0.1.0", + "summary": "Coding standards and conventions for the codeflash codebase", + "private": true, + "rules": { + "code-style": { + "rules": "rules/code-style.md" + }, + "architecture": { + "rules": "rules/architecture.md" + }, + "optimization-patterns": { + "rules": "rules/optimization-patterns.md" + }, + "git-conventions": { + "rules": "rules/git-conventions.md" + }, + "testing-rules": { + "rules": "rules/testing-rules.md" + }, + "language-rules": { + "rules": "rules/language-rules.md" + } + } +} diff --git a/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md 
b/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md new file mode 100644 index 000000000..f5fa89405 --- /dev/null +++ b/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md @@ -0,0 +1,96 @@ +--- +name: add-codeflash-feature +description: Step-by-step workflow for adding a new feature to the codeflash codebase +--- + +# Add Codeflash Feature + +Use this workflow when implementing a new feature in the codeflash codebase. + +## Step 1: Identify Target Modules + +Determine which module(s) need modification based on the feature: + +| Feature area | Primary module | Key files | +|-------------|----------------|-----------| +| New optimization strategy | `optimization/` | `function_optimizer.py`, `optimizer.py` | +| New test type | `verification/`, `models/` | `test_runner.py`, `pytest_plugin.py`, `test_type.py` | +| New AI service endpoint | `api/` | `aiservice.py` | +| New language support | `languages/` | Create new `languages/<language>/support.py` | +| Context extraction change | `context/` | `code_context_extractor.py` | +| New CLI command | `cli_cmds/` | `cli.py` | +| New config option | `setup/`, `code_utils/` | `config_consts.py`, `setup/detector.py` | +| Discovery filter | `discovery/` | `functions_to_optimize.py` | +| PR/result changes | `github/`, `result/` | Relevant handlers | + +## Step 2: Follow Result Type Pattern + +Use the `Result[L, R]` type from `either.py` for error handling in pipeline operations: + +```python +from codeflash.either import Result, Success, Failure, is_successful + +def my_operation() -> Result[MyResultType, str]: + if error_condition: + return Failure("descriptive error message") + return Success(result_value) + +# Usage: +result = my_operation() +if not is_successful(result): + logger.error(result.failure()) + return +value = result.unwrap() +``` + +## Step 3: Add Configuration Constants + +If the feature needs configurable thresholds or limits: + +1. Add constants to `code_utils/config_consts.py` +2.
If effort-dependent, add to `EFFORT_VALUES` dict with values for `LOW`, `MEDIUM`, `HIGH` +3. Add a corresponding `EffortKeys` enum entry +4. Access via `get_effort_value(EffortKeys.MY_KEY, effort_level)` + +## Step 4: Add Domain Types + +If new data structures are needed: + +1. Add Pydantic models or frozen dataclasses to `models/models.py` or `models/function_types.py` +2. Use `@dataclass(frozen=True)` for immutable data +3. Use `BaseModel` for models that need serialization +4. Keep `function_types.py` dependency-free (no imports from other codeflash modules) + +## Step 5: Write Tests + +Follow existing test patterns: + +1. Create test files in the `tests/` directory mirroring the source structure +2. Use pytest's `tmp_path` fixture for temp directories +3. Always call `.resolve()` on Path objects +4. Assert full string equality for code context tests — no substring matching +5. Remember the pytest plugin patches `time`, `random`, `uuid`, `datetime` — don't rely on real values + +## Step 6: Run Quality Checks + +Run all validation before committing: + +```bash +# Pre-commit checks (ruff format + lint) +uv run prek run + +# Type checking +uv run mypy codeflash/ + +# Run relevant tests +uv run pytest tests/path/to/relevant/tests -x +``` + +## Step 7: Language Support Considerations + +If the feature needs to work across languages: + +1. Check if the feature uses language-specific APIs — use `get_language_support(identifier)` from `languages/registry.py` +2. Current language is a singleton: `set_current_language()` / `current_language()` from `languages/current.py` +3. Use `is_python()` / `is_javascript()` guards for language-specific branches +4. 
New language support classes must use `@register_language` decorator diff --git a/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md b/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md new file mode 100644 index 000000000..d0740663e --- /dev/null +++ b/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md @@ -0,0 +1,95 @@ +--- +name: debug-optimization-failure +description: Debug why a codeflash optimization failed at any pipeline stage +--- + +# Debug Optimization Failure + +Use this workflow when an optimization run fails or produces no results. Work through the stages sequentially — stop at the first failure found. + +## Step 1: Check Function Discovery + +Determine if the function was discovered by `FunctionVisitor`. + +1. Look at the discovery output or logs for the function name +2. Check `discovery/functions_to_optimize.py` — the `FunctionVisitor` filters out: + - Functions that are too small or trivial + - Functions matching exclude patterns in config + - Functions already optimized (`was_function_previously_optimized()`) +3. Verify the function file is under the configured `module-root` + +**If not discovered**: Check config patterns, file location, and function size. + +## Step 2: Check Ranking + +If trace data is used, check if the function was ranked high enough. + +1. Look at `benchmarking/function_ranker.py` output +2. The function's **addressable time** must exceed `DEFAULT_IMPORTANCE_THRESHOLD=0.001` +3. Addressable time = own time + callee time / call count + +**If ranked too low**: The function doesn't spend enough time to be worth optimizing. + +## Step 3: Check Context Token Limits + +Verify the function's context fits within token limits. + +1. Check `OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000` and `TESTGEN_CONTEXT_TOKEN_LIMIT=16000` in `code_utils/config_consts.py` +2. Token counting is done by `encoded_tokens_len()` in `code_utils/code_utils.py` +3. 
Large helper function chains or deep dependency trees can blow the limit + +**If context too large**: The function has too many dependencies. Consider refactoring to reduce context size. + +## Step 4: Check AI Service Response + +Verify the AI service returned valid candidates. + +1. Check logs for `AiServiceClient` request/response +2. Look for HTTP errors (non-200 status codes) +3. Verify `_get_valid_candidates()` parsed the response — empty `code_strings` means invalid markdown code blocks +4. Check if all candidates were filtered out during parsing + +**If no candidates returned**: Check API key, network connectivity, and service status. + +## Step 5: Check Test Failures + +Determine if candidates failed behavioral or benchmark tests. + +1. **Behavioral failures**: Compare return values, stdout, pass/fail status between original baseline and candidate + - Check `TestDiffScope`: `RETURN_VALUE`, `STDOUT`, `DID_PASS` + - Look at JUnit XML results for specific test failures +2. **Benchmark failures**: Check if candidate met `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) +3. **Stability failures**: Check if timing was stable within `STABILITY_WINDOW_SIZE=0.35` + +**If behavioral failure**: The optimization changed the function's behavior. Check test diffs for specific mismatches. +**If benchmark failure**: The optimization didn't provide enough speedup. + +## Step 6: Check Deduplication + +Verify candidates weren't deduplicated away. + +1. `CandidateEvaluationContext.ast_code_to_id` tracks normalized code → candidate mapping +2. `normalize_code()` from `code_utils/deduplicate_code.py` normalizes AST for comparison +3. If all candidates normalize to the same code, only one is actually tested + +**If all duplicates**: The LLM generated the same optimization multiple times. Try higher effort level. + +## Step 7: Check Repair/Refinement + +If initial candidates failed, check repair and refinement stages. + +1. 
Repair only runs if fewer than `MIN_CORRECT_CANDIDATES=2` passed +2. Repair sends `AIServiceCodeRepairRequest` with test diffs +3. Check `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` — if too many tests failed, repair is skipped +4. Refinement only runs on top valid candidates + +**If repair also failed**: The optimization approach may not work for this function. + +## Key Files to Check + +- `optimization/function_optimizer.py` — Main optimization loop, `determine_best_candidate()` +- `verification/test_runner.py` — Test execution +- `api/aiservice.py` — AI service communication +- `code_utils/config_consts.py` — Thresholds +- `context/code_context_extractor.py` — Context extraction +- `models/models.py` — `CandidateEvaluationContext`, `TestResults` diff --git a/tiles/codeflash-skills/tile.json b/tiles/codeflash-skills/tile.json new file mode 100644 index 000000000..0dee84ce6 --- /dev/null +++ b/tiles/codeflash-skills/tile.json @@ -0,0 +1,14 @@ +{ + "name": "codeflash/codeflash-skills", + "version": "0.1.0", + "summary": "Procedural workflows for developing and debugging codeflash", + "private": true, + "skills": { + "debug-optimization-failure": { + "path": "skills/debug-optimization-failure/SKILL.md" + }, + "add-codeflash-feature": { + "path": "skills/add-codeflash-feature/SKILL.md" + } + } +} From 18ad00be59db19e67c2ae3748aa8a225ed2cb0dc Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 21:07:24 -0500 Subject: [PATCH 2/5] chore: improve skills to 100% review score and bump to v0.2.0 - Add trigger hints and code snippets to both skills - Add checkpoints after each step - Extract module reference and troubleshooting into linked files - Bump codeflash-skills tile to 0.2.0 --- tessl.json | 2 +- .../add-codeflash-feature/MODULE_REFERENCE.md | 13 ++ .../skills/add-codeflash-feature/SKILL.md | 102 +++++++++++---- .../add-codeflash-feature/TROUBLESHOOTING.md | 9 ++ .../debug-optimization-failure/SKILL.md | 117 +++++++++++------- tiles/codeflash-skills/tile.json 
| 2 +- 6 files changed, 173 insertions(+), 72 deletions(-) create mode 100644 tiles/codeflash-skills/skills/add-codeflash-feature/MODULE_REFERENCE.md create mode 100644 tiles/codeflash-skills/skills/add-codeflash-feature/TROUBLESHOOTING.md diff --git a/tessl.json b/tessl.json index 7061e2c97..2adf295be 100644 --- a/tessl.json +++ b/tessl.json @@ -71,7 +71,7 @@ "version": "0.1.0" }, "codeflash/codeflash-skills": { - "version": "0.1.0" + "version": "0.2.0" } } } diff --git a/tiles/codeflash-skills/skills/add-codeflash-feature/MODULE_REFERENCE.md b/tiles/codeflash-skills/skills/add-codeflash-feature/MODULE_REFERENCE.md new file mode 100644 index 000000000..9012fb294 --- /dev/null +++ b/tiles/codeflash-skills/skills/add-codeflash-feature/MODULE_REFERENCE.md @@ -0,0 +1,13 @@ +# Module Reference + +| Feature area | Primary module | Key files | +|-------------|----------------|-----------| +| New optimization strategy | `optimization/` | `function_optimizer.py`, `optimizer.py` | +| New test type | `verification/`, `models/` | `test_runner.py`, `pytest_plugin.py`, `test_type.py` | +| New AI service endpoint | `api/` | `aiservice.py` | +| New language support | `languages/` | Create new `languages/<language>/support.py` | +| Context extraction change | `context/` | `code_context_extractor.py` | +| New CLI command | `cli_cmds/` | `cli.py` | +| New config option | `setup/`, `code_utils/` | `config_consts.py`, `setup/detector.py` | +| Discovery filter | `discovery/` | `functions_to_optimize.py` | +| PR/result changes | `github/`, `result/` | Relevant handlers | diff --git a/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md b/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md index f5fa89405..f61abfe83 100644 --- a/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md +++ b/tiles/codeflash-skills/skills/add-codeflash-feature/SKILL.md @@ -1,27 +1,23 @@ --- name: add-codeflash-feature -description: Step-by-step workflow for adding a new feature to the codeflash
codebase +description: > + Guides implementation of new functionality in the codeflash optimization engine. + Use when adding a feature, building new functionality, implementing a new + optimization strategy, adding a language backend, creating an API endpoint, + extending the verification pipeline, or developing any new codeflash capability. + Covers module identification, Result type patterns, config, types, tests, and + quality checks. --- # Add Codeflash Feature -Use this workflow when implementing a new feature in the codeflash codebase. +Use this workflow when implementing new functionality in the codeflash codebase — new optimization strategies, language backends, API endpoints, CLI commands, config options, or pipeline extensions. ## Step 1: Identify Target Modules -Determine which module(s) need modification based on the feature: +Determine which module(s) need modification. See [MODULE_REFERENCE.md](MODULE_REFERENCE.md) for the full mapping of feature areas to modules and key files. -| Feature area | Primary module | Key files | -|-------------|----------------|-----------| -| New optimization strategy | `optimization/` | `function_optimizer.py`, `optimizer.py` | -| New test type | `verification/`, `models/` | `test_runner.py`, `pytest_plugin.py`, `test_type.py` | -| New AI service endpoint | `api/` | `aiservice.py` | -| New language support | `languages/` | Create new `languages/<language>/support.py` | -| Context extraction change | `context/` | `code_context_extractor.py` | -| New CLI command | `cli_cmds/` | `cli.py` | -| New config option | `setup/`, `code_utils/` | `config_consts.py`, `setup/detector.py` | -| Discovery filter | `discovery/` | `functions_to_optimize.py` | -| PR/result changes | `github/`, `result/` | Relevant handlers | +**Checkpoint**: Read the target files and understand existing patterns before writing any code. Look for similar features already implemented as reference.
## Step 2: Follow Result Type Pattern @@ -43,33 +39,76 @@ if not is_successful(result): value = result.unwrap() ``` +**Checkpoint**: Verify your function signatures match the `Result` pattern used in surrounding code. Not all functions use `Result` — match the convention of the module you're modifying. + ## Step 3: Add Configuration Constants If the feature needs configurable thresholds or limits: 1. Add constants to `code_utils/config_consts.py` -2. If effort-dependent, add to `EFFORT_VALUES` dict with values for `LOW`, `MEDIUM`, `HIGH` -3. Add a corresponding `EffortKeys` enum entry -4. Access via `get_effort_value(EffortKeys.MY_KEY, effort_level)` +2. If effort-dependent, add to `EFFORT_VALUES` dict with values for all three levels: + ```python + # In config_consts.py: + class EffortKeys(str, Enum): + MY_NEW_KEY = "MY_NEW_KEY" + + EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = { + # ... existing entries ... + EffortKeys.MY_NEW_KEY.value: { + EffortLevel.LOW: 1, + EffortLevel.MEDIUM: 3, + EffortLevel.HIGH: 5, + }, + } + ``` +3. Access via `get_effort_value(EffortKeys.MY_NEW_KEY, effort_level)` + +**Checkpoint**: Skip this step if the feature doesn't need configuration. Not every feature requires new constants. ## Step 4: Add Domain Types If new data structures are needed: 1. Add Pydantic models or frozen dataclasses to `models/models.py` or `models/function_types.py` -2. Use `@dataclass(frozen=True)` for immutable data -3. Use `BaseModel` for models that need serialization -4. Keep `function_types.py` dependency-free (no imports from other codeflash modules) +2. Use `@dataclass(frozen=True)` for immutable data, `BaseModel` for models that need serialization +3. 
Keep `function_types.py` dependency-free — no imports from other codeflash modules + +Example following existing patterns: +```python +# In models/models.py: +@dataclass(frozen=True) +class MyNewType: + name: str + value: int + source: OptimizedCandidateSource + +# For serializable models: +class MyNewModel(BaseModel): + items: list[MyNewType] = [] +``` + +**Checkpoint**: Skip this step if you can reuse existing types. Check `models/models.py` for types that already fit your needs. ## Step 5: Write Tests Follow existing test patterns: -1. Create test files in the `tests/` directory mirroring the source structure -2. Use pytest's `tmp_path` fixture for temp directories -3. Always call `.resolve()` on Path objects +1. Create test files in `tests/` mirroring the source structure (e.g., `tests/test_optimization/test_my_feature.py`) +2. Use pytest's `tmp_path` fixture for temp directories — never `NamedTemporaryFile` +3. Always call `.resolve()` on Path objects and `.as_posix()` for string conversion 4. Assert full string equality for code context tests — no substring matching -5. Remember the pytest plugin patches `time`, `random`, `uuid`, `datetime` — don't rely on real values +5. 
The pytest plugin patches `time`, `random`, `uuid`, `datetime` — never rely on real values in verification tests + +```python +def test_my_feature(tmp_path: Path) -> None: + test_file = tmp_path / "test_module.py" + test_file.write_text("def foo(): return 1", encoding="utf-8") + result = my_operation(test_file.resolve()) + assert is_successful(result) + assert result.unwrap() == expected_value +``` + +**Checkpoint**: Run the new tests in isolation before proceeding: `uv run pytest tests/path/to/test_file.py -x` ## Step 6: Run Quality Checks @@ -86,11 +125,22 @@ uv run mypy codeflash/ uv run pytest tests/path/to/relevant/tests -x ``` +**If checks fail**: +- `prek run` failures: Fix formatting/lint issues reported by ruff, then re-run +- `mypy` failures: Fix type errors — common issues are missing return types, wrong `Optional` usage, or missing imports in `TYPE_CHECKING` block +- Test failures: Fix the failing test or the implementation, then re-run + ## Step 7: Language Support Considerations If the feature needs to work across languages: -1. Check if the feature uses language-specific APIs — use `get_language_support(identifier)` from `languages/registry.py` +1. Use `get_language_support(identifier)` from `languages/registry.py` — never import language classes directly 2. Current language is a singleton: `set_current_language()` / `current_language()` from `languages/current.py` 3. Use `is_python()` / `is_javascript()` guards for language-specific branches -4. New language support classes must use `@register_language` decorator +4. New language support classes must use `@register_language` decorator and be instantiable without arguments + +**Checkpoint**: Skip this step if the feature is Python-only. Most features don't need multi-language support. 
+ +## Troubleshooting + +If you run into issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems and fixes (circular imports, `UnsupportedLanguageError`, CI path failures, Pydantic validation errors, token limit exceeded). diff --git a/tiles/codeflash-skills/skills/add-codeflash-feature/TROUBLESHOOTING.md b/tiles/codeflash-skills/skills/add-codeflash-feature/TROUBLESHOOTING.md new file mode 100644 index 000000000..6c56f8d0b --- /dev/null +++ b/tiles/codeflash-skills/skills/add-codeflash-feature/TROUBLESHOOTING.md @@ -0,0 +1,9 @@ +# Troubleshooting + +| Problem | Likely cause | Fix | +|---------|-------------|-----| +| Circular import at startup | Importing from `models/` in a module loaded early | Move import into `TYPE_CHECKING` block or use lazy import | +| `UnsupportedLanguageError` | Language modules not registered yet | Call `_ensure_languages_registered()` or use `get_language_support()` which does it automatically | +| Tests pass locally but fail in CI | Path differences (absolute vs relative) | Always use `.resolve()` on Path objects | +| `ValidationError` from Pydantic | Invalid code passed to `CodeString` | Check that generated code passes syntax validation for the target language | +| `encoded_tokens_len` exceeds limit | Context too large | Reduce helper functions or split into read-only vs read-writable | diff --git a/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md b/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md index d0740663e..f85c56641 100644 --- a/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md +++ b/tiles/codeflash-skills/skills/debug-optimization-failure/SKILL.md @@ -1,6 +1,10 @@ --- name: debug-optimization-failure -description: Debug why a codeflash optimization failed at any pipeline stage +description: > + Diagnose why a codeflash optimization produced no results or failed silently. 
+ Use when an optimization run errors out, returns no candidates, or all candidates + are rejected. Walks through discovery, ranking, context limits, AI service, + test verification, deduplication, and repair stages. --- # Debug Optimization Failure @@ -11,85 +15,110 @@ Use this workflow when an optimization run fails or produces no results. Work th Determine if the function was discovered by `FunctionVisitor`. -1. Look at the discovery output or logs for the function name -2. Check `discovery/functions_to_optimize.py` — the `FunctionVisitor` filters out: - - Functions that are too small or trivial - - Functions matching exclude patterns in config - - Functions already optimized (`was_function_previously_optimized()`) -3. Verify the function file is under the configured `module-root` +1. Search logs for the function name in discovery output: + ```python + # In discovery/functions_to_optimize.py, FunctionVisitor filters out: + # - Functions matching exclude patterns in pyproject.toml [tool.codeflash] + # - Functions already optimized (was_function_previously_optimized()) + # - Functions outside the configured module-root + ``` +2. Verify the function file is under the configured `module-root` in `pyproject.toml` +3. Check if the function was previously optimized — look for it in the optimization history -**If not discovered**: Check config patterns, file location, and function size. +**Checkpoint**: If the function doesn't appear in discovery output, fix config patterns or file location before proceeding. ## Step 2: Check Ranking If trace data is used, check if the function was ranked high enough. -1. Look at `benchmarking/function_ranker.py` output -2. The function's **addressable time** must exceed `DEFAULT_IMPORTANCE_THRESHOLD=0.001` -3. Addressable time = own time + callee time / call count +1. Look at `benchmarking/function_ranker.py` output for the function's addressable time +2. 
The function must exceed `DEFAULT_IMPORTANCE_THRESHOLD=0.001`: + ```python + # Addressable time = own time + callee time / call count + # Grep for the function in ranking output: + # grep -i "function_name" in ranking logs + ``` +3. Functions below the threshold are silently skipped -**If ranked too low**: The function doesn't spend enough time to be worth optimizing. +**Checkpoint**: If ranked too low, the function doesn't spend enough time to be worth optimizing. No fix needed — this is expected. ## Step 3: Check Context Token Limits Verify the function's context fits within token limits. -1. Check `OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000` and `TESTGEN_CONTEXT_TOKEN_LIMIT=16000` in `code_utils/config_consts.py` -2. Token counting is done by `encoded_tokens_len()` in `code_utils/code_utils.py` -3. Large helper function chains or deep dependency trees can blow the limit +1. Check thresholds in `code_utils/config_consts.py`: + ```python + OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000 # tokens + TESTGEN_CONTEXT_TOKEN_LIMIT = 16000 # tokens + ``` +2. Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py` +3. Common causes: large helper function chains, deep dependency trees, large class hierarchies -**If context too large**: The function has too many dependencies. Consider refactoring to reduce context size. +**Checkpoint**: If context exceeds limits, the function is rejected. Consider refactoring to reduce dependencies or splitting large modules. ## Step 4: Check AI Service Response Verify the AI service returned valid candidates. -1. Check logs for `AiServiceClient` request/response -2. Look for HTTP errors (non-200 status codes) -3. Verify `_get_valid_candidates()` parsed the response — empty `code_strings` means invalid markdown code blocks -4. Check if all candidates were filtered out during parsing +1. 
Look for HTTP errors in logs: + ``` + # Error patterns to search for: + "Error generating optimized candidates" + "Error generating jit rewritten candidate" + "cli-optimize-error-caught" + "cli-optimize-error-response" + ``` +2. Check `_get_valid_candidates()` in `api/aiservice.py` — empty `code_strings` after `CodeStringsMarkdown.parse_markdown_code()` means the LLM returned malformed code blocks +3. Verify API key is valid (`get_codeflash_api_key()`) -**If no candidates returned**: Check API key, network connectivity, and service status. +**Checkpoint**: If no candidates returned, check API key, network, and service status before proceeding. ## Step 5: Check Test Failures Determine if candidates failed behavioral or benchmark tests. -1. **Behavioral failures**: Compare return values, stdout, pass/fail status between original baseline and candidate - - Check `TestDiffScope`: `RETURN_VALUE`, `STDOUT`, `DID_PASS` - - Look at JUnit XML results for specific test failures -2. **Benchmark failures**: Check if candidate met `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) -3. **Stability failures**: Check if timing was stable within `STABILITY_WINDOW_SIZE=0.35` +1. **Behavioral failures** — compare return values, stdout, pass/fail between baseline and candidate: + ```python + # TestDiffScope enum values to look for: + # RETURN_VALUE - function returned different value + # STDOUT - different stdout output + # DID_PASS - test passed/failed differently + ``` +2. **Benchmark failures** — candidate must beat `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) +3. **Stability failures** — timing must be stable within `STABILITY_WINDOW_SIZE=0.35` (35% of iterations) +4. Check JUnit XML test results in the temp directory for specific failure messages -**If behavioral failure**: The optimization changed the function's behavior. Check test diffs for specific mismatches. -**If benchmark failure**: The optimization didn't provide enough speedup. 
+**Checkpoint**: Behavioral failure = optimization changed behavior (check test diffs). Benchmark failure = not fast enough. Stability failure = noisy timing environment. ## Step 6: Check Deduplication Verify candidates weren't deduplicated away. -1. `CandidateEvaluationContext.ast_code_to_id` tracks normalized code → candidate mapping -2. `normalize_code()` from `code_utils/deduplicate_code.py` normalizes AST for comparison -3. If all candidates normalize to the same code, only one is actually tested +1. `CandidateEvaluationContext.ast_code_to_id` tracks normalized AST → candidate mapping +2. `normalize_code()` from `code_utils/deduplicate_code.py` strips comments/whitespace and normalizes the AST +3. If all candidates normalize to identical code, only the first is tested — the rest copy its results -**If all duplicates**: The LLM generated the same optimization multiple times. Try higher effort level. +**Checkpoint**: If all duplicates, the LLM generated the same optimization repeatedly. Try a higher effort level for more diverse candidates. ## Step 7: Check Repair/Refinement If initial candidates failed, check repair and refinement stages. -1. Repair only runs if fewer than `MIN_CORRECT_CANDIDATES=2` passed -2. Repair sends `AIServiceCodeRepairRequest` with test diffs -3. Check `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` — if too many tests failed, repair is skipped -4. Refinement only runs on top valid candidates +1. Repair only triggers if fewer than `MIN_CORRECT_CANDIDATES=2` passed behavioral tests +2. Repair sends `AIServiceCodeRepairRequest` with `TestDiff` objects showing what went wrong +3. Check `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` (effort-dependent: 0.2/0.3/0.4) — if too many tests failed, repair is skipped entirely +4. Refinement only runs on the top valid candidates (count depends on effort level) -**If repair also failed**: The optimization approach may not work for this function. 
+**Checkpoint**: If repair also fails, the optimization approach likely doesn't work for this function. The function may rely on side effects or external state that the LLM can't safely optimize. -## Key Files to Check +## Key Files Reference -- `optimization/function_optimizer.py` — Main optimization loop, `determine_best_candidate()` -- `verification/test_runner.py` — Test execution -- `api/aiservice.py` — AI service communication -- `code_utils/config_consts.py` — Thresholds -- `context/code_context_extractor.py` — Context extraction -- `models/models.py` — `CandidateEvaluationContext`, `TestResults` +| File | What to check | +|------|---------------| +| `optimization/function_optimizer.py` | Main loop, `determine_best_candidate()` | +| `verification/test_runner.py` | Test subprocess execution | +| `api/aiservice.py` | AI service requests/responses | +| `code_utils/config_consts.py` | All thresholds and limits | +| `context/code_context_extractor.py` | Context extraction and token counting | +| `models/models.py` | `CandidateEvaluationContext`, `TestResults`, `TestDiff` | +| `code_utils/deduplicate_code.py` | AST normalization for deduplication | diff --git a/tiles/codeflash-skills/tile.json b/tiles/codeflash-skills/tile.json index 0dee84ce6..01d7a9481 100644 --- a/tiles/codeflash-skills/tile.json +++ b/tiles/codeflash-skills/tile.json @@ -1,6 +1,6 @@ { "name": "codeflash/codeflash-skills", - "version": "0.1.0", + "version": "0.2.0", "summary": "Procedural workflows for developing and debugging codeflash", "private": true, "skills": { From 289b75c555c2ce384cfc845e47572b126caee907 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 21:08:25 -0500 Subject: [PATCH 3/5] chore: add tessl-managed gitignore for codex and gemini skill symlinks --- .codex/skills/.gitignore | 2 ++ .gemini/skills/.gitignore | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 .codex/skills/.gitignore create mode 100644 .gemini/skills/.gitignore diff --git 
a/.codex/skills/.gitignore b/.codex/skills/.gitignore new file mode 100644 index 000000000..b1cda282a --- /dev/null +++ b/.codex/skills/.gitignore @@ -0,0 +1,2 @@ +# Managed by Tessl +tessl:* diff --git a/.gemini/skills/.gitignore b/.gemini/skills/.gitignore new file mode 100644 index 000000000..b1cda282a --- /dev/null +++ b/.gemini/skills/.gitignore @@ -0,0 +1,2 @@ +# Managed by Tessl +tessl:* From ff2abd29f2a0d6fd62642c591b11325f027afc8b Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 21:24:54 -0500 Subject: [PATCH 4/5] chore: add eval scenarios for codeflash-skills tile 5 scenarios testing: sequential debugging, Result type + effort config, test patterns, domain type conventions, and deduplication/repair mechanics. Also adds tessl-labs/tessl-skill-eval-scenarios dev dependency. --- tessl.json | 3 + .../codeflash-skills/evals/capabilities.json | 104 ++++++++++++++++++ .../evals/scenario-1/capability.txt | 1 + .../evals/scenario-1/criteria.json | 26 +++++ .../codeflash-skills/evals/scenario-1/task.md | 13 +++ .../evals/scenario-2/capability.txt | 1 + .../evals/scenario-2/criteria.json | 31 ++++++ .../codeflash-skills/evals/scenario-2/task.md | 21 ++++ .../evals/scenario-3/capability.txt | 1 + .../evals/scenario-3/criteria.json | 26 +++++ .../codeflash-skills/evals/scenario-3/task.md | 24 ++++ .../evals/scenario-4/capability.txt | 1 + .../evals/scenario-4/criteria.json | 26 +++++ .../codeflash-skills/evals/scenario-4/task.md | 21 ++++ .../evals/scenario-5/capability.txt | 1 + .../evals/scenario-5/criteria.json | 26 +++++ .../codeflash-skills/evals/scenario-5/task.md | 17 +++ tiles/codeflash-skills/evals/summary.json | 40 +++++++ .../evals/summary_infeasible.json | 25 +++++ 19 files changed, 408 insertions(+) create mode 100644 tiles/codeflash-skills/evals/capabilities.json create mode 100644 tiles/codeflash-skills/evals/scenario-1/capability.txt create mode 100644 tiles/codeflash-skills/evals/scenario-1/criteria.json create mode 100644 
tiles/codeflash-skills/evals/scenario-1/task.md create mode 100644 tiles/codeflash-skills/evals/scenario-2/capability.txt create mode 100644 tiles/codeflash-skills/evals/scenario-2/criteria.json create mode 100644 tiles/codeflash-skills/evals/scenario-2/task.md create mode 100644 tiles/codeflash-skills/evals/scenario-3/capability.txt create mode 100644 tiles/codeflash-skills/evals/scenario-3/criteria.json create mode 100644 tiles/codeflash-skills/evals/scenario-3/task.md create mode 100644 tiles/codeflash-skills/evals/scenario-4/capability.txt create mode 100644 tiles/codeflash-skills/evals/scenario-4/criteria.json create mode 100644 tiles/codeflash-skills/evals/scenario-4/task.md create mode 100644 tiles/codeflash-skills/evals/scenario-5/capability.txt create mode 100644 tiles/codeflash-skills/evals/scenario-5/criteria.json create mode 100644 tiles/codeflash-skills/evals/scenario-5/task.md create mode 100644 tiles/codeflash-skills/evals/summary.json create mode 100644 tiles/codeflash-skills/evals/summary_infeasible.json diff --git a/tessl.json b/tessl.json index 2adf295be..d766df3ba 100644 --- a/tessl.json +++ b/tessl.json @@ -72,6 +72,9 @@ }, "codeflash/codeflash-skills": { "version": "0.2.0" + }, + "tessl-labs/tessl-skill-eval-scenarios": { + "version": "0.0.5" } } } diff --git a/tiles/codeflash-skills/evals/capabilities.json b/tiles/codeflash-skills/evals/capabilities.json new file mode 100644 index 000000000..cda33c968 --- /dev/null +++ b/tiles/codeflash-skills/evals/capabilities.json @@ -0,0 +1,104 @@ +{ + "package_name": "codeflash-skills", + "total_capabilities": 14, + "capabilities": [ + { + "id": 0, + "name": "sequential-pipeline-debugging", + "description": "Debug optimization failures by walking through pipeline stages sequentially and stopping at the first failure found", + "complexity": "intermediate", + "api_elements": ["discovery", "ranking", "context", "AI service", "verification", "deduplication", "repair"] + }, + { + "id": 1, + "name": 
"token-limit-awareness", + "description": "Know that OPTIMIZATION_CONTEXT_TOKEN_LIMIT and TESTGEN_CONTEXT_TOKEN_LIMIT are both 16000 tokens and that exceeding them causes function rejection", + "complexity": "basic", + "api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"] + }, + { + "id": 2, + "name": "improvement-threshold", + "description": "Know that MIN_IMPROVEMENT_THRESHOLD is 0.05 (5%) and candidates below this speedup are rejected", + "complexity": "basic", + "api_elements": ["MIN_IMPROVEMENT_THRESHOLD", "STABILITY_WINDOW_SIZE"] + }, + { + "id": 3, + "name": "ast-deduplication", + "description": "Know that candidates are deduplicated via AST normalization using normalize_code() and CandidateEvaluationContext.ast_code_to_id", + "complexity": "intermediate", + "api_elements": ["normalize_code()", "CandidateEvaluationContext.ast_code_to_id", "code_utils/deduplicate_code.py"] + }, + { + "id": 4, + "name": "repair-trigger-conditions", + "description": "Know that repair only triggers when fewer than MIN_CORRECT_CANDIDATES=2 pass, and is skipped when REPAIR_UNMATCHED_PERCENTAGE_LIMIT is exceeded", + "complexity": "advanced", + "api_elements": ["MIN_CORRECT_CANDIDATES", "REPAIR_UNMATCHED_PERCENTAGE_LIMIT", "AIServiceCodeRepairRequest"] + }, + { + "id": 5, + "name": "ai-service-error-patterns", + "description": "Know specific log patterns to search for when AI service fails: 'Error generating optimized candidates', 'cli-optimize-error-caught', 'cli-optimize-error-response'", + "complexity": "intermediate", + "api_elements": ["AiServiceClient", "api/aiservice.py"] + }, + { + "id": 6, + "name": "behavioral-vs-benchmark-failures", + "description": "Distinguish between behavioral test failures (return value/stdout/pass-fail mismatches via TestDiffScope) and benchmark failures (speedup below threshold)", + "complexity": "intermediate", + "api_elements": ["TestDiffScope", "RETURN_VALUE", "STDOUT", "DID_PASS"] + }, + { + 
"id": 7, + "name": "result-type-pattern", + "description": "Use Result[L, R] from either.py with Success/Failure constructors and is_successful() checks before unwrap()", + "complexity": "basic", + "api_elements": ["Result", "Success", "Failure", "is_successful", "unwrap()", "either.py"] + }, + { + "id": 8, + "name": "effort-config-pattern", + "description": "Add effort-dependent config via EffortKeys enum, EFFORT_VALUES dict with LOW/MEDIUM/HIGH levels, and get_effort_value()", + "complexity": "intermediate", + "api_elements": ["EffortKeys", "EffortLevel", "EFFORT_VALUES", "get_effort_value()", "config_consts.py"] + }, + { + "id": 9, + "name": "module-to-feature-mapping", + "description": "Know which codeflash module to modify for different feature types (optimization/ for strategies, api/ for endpoints, languages/ for language support, etc.)", + "complexity": "basic", + "api_elements": ["MODULE_REFERENCE.md"] + }, + { + "id": 10, + "name": "domain-type-conventions", + "description": "Use @dataclass(frozen=True) for immutable data, BaseModel for serializable models, and keep function_types.py dependency-free", + "complexity": "intermediate", + "api_elements": ["@dataclass(frozen=True)", "BaseModel", "models/models.py", "models/function_types.py"] + }, + { + "id": 11, + "name": "test-patterns", + "description": "Use tmp_path fixture, .resolve() on Paths, .as_posix() for string conversion, full string equality assertions, and awareness of deterministic patches", + "complexity": "basic", + "api_elements": ["tmp_path", ".resolve()", ".as_posix()", "pytest_plugin.py"] + }, + { + "id": 12, + "name": "quality-check-commands", + "description": "Run uv run prek run for formatting/linting, uv run mypy for type checking, and uv run pytest for tests", + "complexity": "basic", + "api_elements": ["uv run prek run", "uv run mypy", "uv run pytest"] + }, + { + "id": 13, + "name": "language-support-patterns", + "description": "Use @register_language decorator, 
get_language_support() for lookup, singleton pattern via set_current_language()/current_language(), and is_python()/is_javascript() guards", + "complexity": "advanced", + "api_elements": ["@register_language", "get_language_support()", "set_current_language()", "is_python()", "is_javascript()"] + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-1/capability.txt b/tiles/codeflash-skills/evals/scenario-1/capability.txt new file mode 100644 index 000000000..c4d34b1aa --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-1/capability.txt @@ -0,0 +1 @@ +Sequential pipeline debugging with specific thresholds \ No newline at end of file diff --git a/tiles/codeflash-skills/evals/scenario-1/criteria.json b/tiles/codeflash-skills/evals/scenario-1/criteria.json new file mode 100644 index 000000000..cec7afda7 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-1/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Sequential stage order", + "description": "Investigates pipeline stages in order: discovery before ranking before context before AI service before test failures. 
Does NOT jump to later stages without checking earlier ones first.", + "max_score": 25 + }, + { + "name": "Token limit value", + "description": "References the specific token limit of 16000 for OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT when checking context extraction", + "max_score": 25 + }, + { + "name": "Importance threshold", + "description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking", + "max_score": 25 + }, + { + "name": "Stops at failure", + "description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-1/task.md b/tiles/codeflash-skills/evals/scenario-1/task.md new file mode 100644 index 000000000..17c74d8cb --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-1/task.md @@ -0,0 +1,13 @@ +# Diagnose Silent Optimization Skip + +## Context + +A user reports that when running codeflash on their project, a specific function `calculate_metrics` in `analytics/processor.py` never appears in the optimization results. The function exists in the module root, is not in the exclude list, and has not been previously optimized. Trace data shows the function is called frequently but with very short execution times (averaging 0.0005 seconds total addressable time). The function has moderate dependencies. + +## Task + +Write a diagnostic report explaining why this function is being skipped and at which stage in the pipeline the function is filtered out. Include the specific threshold or condition that causes the skip. + +## Expected Outputs + +A markdown file `diagnostic-report.md` explaining the root cause. 
diff --git a/tiles/codeflash-skills/evals/scenario-2/capability.txt b/tiles/codeflash-skills/evals/scenario-2/capability.txt new file mode 100644 index 000000000..72b283863 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-2/capability.txt @@ -0,0 +1 @@ +Result type pattern and effort-dependent configuration \ No newline at end of file diff --git a/tiles/codeflash-skills/evals/scenario-2/criteria.json b/tiles/codeflash-skills/evals/scenario-2/criteria.json new file mode 100644 index 000000000..9c49891b8 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-2/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests whether the agent uses the codeflash Result type pattern from either.py and the effort-dependent configuration pattern when implementing a new pipeline feature.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Imports from either.py", + "description": "Imports Success, Failure, and is_successful from codeflash.either (NOT from a different error handling module)", + "max_score": 20 + }, + { + "name": "Result return type", + "description": "Function returns Result type using Success() for success and Failure() for errors, not exceptions or None", + "max_score": 20 + }, + { + "name": "is_successful check", + "description": "Calls is_successful() or .is_successful() before calling unwrap() on the result", + "max_score": 20 + }, + { + "name": "EffortKeys enum entry", + "description": "Adds a new entry to the EffortKeys enum in config_consts.py", + "max_score": 20 + }, + { + "name": "Three effort levels", + "description": "Adds values for all three EffortLevel variants (LOW, MEDIUM, HIGH) in EFFORT_VALUES dict", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-2/task.md b/tiles/codeflash-skills/evals/scenario-2/task.md new file mode 100644 index 000000000..dfe684d14 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-2/task.md @@ -0,0 +1,21 @@ +# Add Candidate Timeout Feature + +## Context + +The 
codeflash optimization engine currently has no per-candidate timeout. Some candidates take too long during verification, wasting the optimization budget. A new feature is needed to skip candidates that exceed a configurable time limit during behavioral testing. + +The timeout should vary based on the optimization effort setting — shorter timeouts for low effort runs (to save time) and longer for high effort runs (to allow more complex optimizations). + +## Task + +Implement a `check_candidate_timeout` function in `codeflash/optimization/function_optimizer.py` that: +1. Takes a candidate runtime and returns whether the candidate should be skipped +2. Uses a configurable timeout threshold that scales with optimization effort +3. Handles the error case where the runtime measurement is unavailable + +Also add the necessary configuration constant to `codeflash/code_utils/config_consts.py`. + +## Expected Outputs + +- Modified `function_optimizer.py` with the new function +- Modified `config_consts.py` with the new configuration diff --git a/tiles/codeflash-skills/evals/scenario-3/capability.txt b/tiles/codeflash-skills/evals/scenario-3/capability.txt new file mode 100644 index 000000000..1fa504dee --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-3/capability.txt @@ -0,0 +1 @@ +Test patterns and deterministic patch awareness \ No newline at end of file diff --git a/tiles/codeflash-skills/evals/scenario-3/criteria.json b/tiles/codeflash-skills/evals/scenario-3/criteria.json new file mode 100644 index 000000000..ccf96e3fa --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-3/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent follows codeflash test conventions when writing tests, including path handling, temp directory patterns, and awareness of the deterministic patching system.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Uses tmp_path fixture", + "description": "Test function uses pytest tmp_path fixture parameter, 
NOT tempfile.NamedTemporaryFile or tempfile.mkdtemp", + "max_score": 25 + }, + { + "name": "Calls resolve on paths", + "description": "Calls .resolve() on Path objects before using them in assertions or function calls", + "max_score": 25 + }, + { + "name": "Full string equality", + "description": "Uses exact equality assertions (== or assert_equal) for code string comparisons, NOT substring checks like 'in' or assertIn or contains", + "max_score": 25 + }, + { + "name": "No real time dependency", + "description": "Test does NOT depend on real time.time(), datetime.now(), random values, or uuid generation for correctness. Acknowledges or accounts for deterministic patches if time/random values are involved.", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-3/task.md b/tiles/codeflash-skills/evals/scenario-3/task.md new file mode 100644 index 000000000..5b13a15d6 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-3/task.md @@ -0,0 +1,24 @@ +# Write Tests for Context Hash Comparison + +## Context + +The codeflash context extraction module has a function `compare_context_hashes(context_a, context_b)` that takes two `CodeOptimizationContext` objects and returns whether their hashing contexts are identical. This is used to detect when the same function has already been optimized. + +```python +# In codeflash/context/code_context_extractor.py +def compare_context_hashes(context_a: CodeOptimizationContext, context_b: CodeOptimizationContext) -> bool: + return context_a.hashing_code_context_hash == context_b.hashing_code_context_hash +``` + +## Task + +Write a test file `tests/test_context/test_hash_comparison.py` with tests for this function. Include tests for: +1. Two contexts with identical code producing the same hash +2. Two contexts with different code producing different hashes +3. A context compared with itself + +The tests should create temporary Python source files to build realistic context objects. 
+ +## Expected Outputs + +- `tests/test_context/test_hash_comparison.py` diff --git a/tiles/codeflash-skills/evals/scenario-4/capability.txt b/tiles/codeflash-skills/evals/scenario-4/capability.txt new file mode 100644 index 000000000..c0d3fea71 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-4/capability.txt @@ -0,0 +1 @@ +Domain type conventions and module identification \ No newline at end of file diff --git a/tiles/codeflash-skills/evals/scenario-4/criteria.json b/tiles/codeflash-skills/evals/scenario-4/criteria.json new file mode 100644 index 000000000..20861011c --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-4/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent follows codeflash domain type conventions and correctly identifies the right module when adding a new data type for the optimization pipeline.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Placed in models/models.py", + "description": "New data type is added to codeflash/models/models.py (NOT models/function_types.py, since it has dependencies on other codeflash modules)", + "max_score": 25 + }, + { + "name": "Uses frozen dataclass", + "description": "Immutable data type uses @dataclass(frozen=True) decorator, NOT a regular class or unfrozen dataclass", + "max_score": 25 + }, + { + "name": "BaseModel for serializable", + "description": "If a serializable model is needed, uses Pydantic BaseModel (NOT dataclass or dict)", + "max_score": 25 + }, + { + "name": "Correct module for feature", + "description": "Places the main logic in the correct module for the feature type (e.g., verification/ for test-related, optimization/ for candidate-related, api/ for service-related)", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-4/task.md b/tiles/codeflash-skills/evals/scenario-4/task.md new file mode 100644 index 000000000..61299a115 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-4/task.md @@ -0,0 +1,21 @@ +# 
Add Optimization Confidence Score + +## Context + +The codeflash team wants to add a confidence score to each optimization result. The score should capture how confident the system is that an optimization is both correct and beneficial. It combines test coverage percentage, number of passing test cases, and speedup stability into a single metric. + +The score needs to be: +- Attached to each candidate during evaluation (immutable once computed) +- Included in the final PR report (needs JSON serialization) +- Computed during the candidate evaluation phase + +## Task + +1. Define the data types needed for the confidence score +2. Write a `compute_confidence_score` function that takes coverage percentage (float), passing test count (int), and stability ratio (float) and returns the confidence result +3. Place all code in the appropriate codeflash modules + +## Expected Outputs + +- New/modified type definitions in the appropriate models file +- New function in the appropriate module diff --git a/tiles/codeflash-skills/evals/scenario-5/capability.txt b/tiles/codeflash-skills/evals/scenario-5/capability.txt new file mode 100644 index 000000000..28a3fe8ee --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-5/capability.txt @@ -0,0 +1 @@ +Deduplication mechanics and repair trigger conditions \ No newline at end of file diff --git a/tiles/codeflash-skills/evals/scenario-5/criteria.json b/tiles/codeflash-skills/evals/scenario-5/criteria.json new file mode 100644 index 000000000..8c3f8e817 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-5/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "AST normalization", + "description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), 
NOT simple string comparison", + "max_score": 25 + }, + { + "name": "Duplicate result copying", + "description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested", + "max_score": 25 + }, + { + "name": "Repair trigger threshold", + "description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails", + "max_score": 25 + }, + { + "name": "Unmatched percentage limit", + "description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-skills/evals/scenario-5/task.md b/tiles/codeflash-skills/evals/scenario-5/task.md new file mode 100644 index 000000000..19995f3e6 --- /dev/null +++ b/tiles/codeflash-skills/evals/scenario-5/task.md @@ -0,0 +1,17 @@ +# Investigate Low Candidate Diversity + +## Context + +A codeflash user is optimizing a data processing function at medium effort level. The AI service returns 5 candidates, but the optimization log shows only 1 candidate was actually benchmarked. Of the 5 candidates, 1 passed behavioral tests but didn't meet the performance threshold. The user wants to understand what happened to the other 4 candidates and why no repair attempts were made. + +## Task + +Write an analysis document explaining: +1. Why only 1 out of 5 candidates was benchmarked +2. How the system determines which candidates to actually test +3. Under what conditions the system would have attempted to repair the failing candidates +4. What the user could change to get more diverse results + +## Expected Outputs + +A markdown file `analysis.md` with the explanation. 
diff --git a/tiles/codeflash-skills/evals/summary.json b/tiles/codeflash-skills/evals/summary.json new file mode 100644 index 000000000..c5929299f --- /dev/null +++ b/tiles/codeflash-skills/evals/summary.json @@ -0,0 +1,40 @@ +{ + "total_scenarios": 5, + "capabilities_coverage": { + "total_capabilities": 14, + "capabilities_tested": 10, + "coverage_percentage": 71.4 + }, + "complexity_distribution": { + "basic": 2, + "intermediate": 2, + "advanced": 1 + }, + "scenarios": [ + { + "index": 1, + "capability": "sequential-pipeline-debugging, token-limit-awareness, improvement-threshold", + "complexity": "intermediate" + }, + { + "index": 2, + "capability": "result-type-pattern, effort-config-pattern", + "complexity": "intermediate" + }, + { + "index": 3, + "capability": "test-patterns", + "complexity": "basic" + }, + { + "index": 4, + "capability": "domain-type-conventions, module-to-feature-mapping", + "complexity": "basic" + }, + { + "index": 5, + "capability": "ast-deduplication, repair-trigger-conditions", + "complexity": "advanced" + } + ] +} diff --git a/tiles/codeflash-skills/evals/summary_infeasible.json b/tiles/codeflash-skills/evals/summary_infeasible.json new file mode 100644 index 000000000..36da50727 --- /dev/null +++ b/tiles/codeflash-skills/evals/summary_infeasible.json @@ -0,0 +1,25 @@ +{ + "total_infeasible": 4, + "infeasible_capabilities": [ + { + "capability": "ai-service-error-patterns", + "complexity": "intermediate", + "reasoning": "Requires actual AI service API responses and log output that cannot be meaningfully mocked without bypassing the capability being tested" + }, + { + "capability": "behavioral-vs-benchmark-failures", + "complexity": "intermediate", + "reasoning": "Requires actual test execution results with JUnit XML output and timing data that cannot be generated in a one-shot file-based eval" + }, + { + "capability": "language-support-patterns", + "complexity": "advanced", + "reasoning": "Requires the full
language registry system with imports and decorators that would need the codeflash runtime to verify" + }, + { + "capability": "quality-check-commands", + "complexity": "basic", + "reasoning": "Requires running actual uv/prek/mypy commands which need the project environment and dependencies installed" + } + ] +} From 869fbe176666bf694f1f5ec7653ffc7fdab9a43c Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 21:29:22 -0500 Subject: [PATCH 5/5] chore: add eval scenarios for codeflash-docs tile 5 scenarios testing: code serialization format, candidate lifecycle/DAG, deterministic patches, effort levels/selection criteria, and function representation/concurrency model. --- tiles/codeflash-docs/evals/capabilities.json | 118 ++++++++++++++++++ .../evals/scenario-1/capability.txt | 1 + .../evals/scenario-1/criteria.json | 21 ++++ tiles/codeflash-docs/evals/scenario-1/task.md | 35 ++++++ .../evals/scenario-2/capability.txt | 1 + .../evals/scenario-2/criteria.json | 26 ++++ tiles/codeflash-docs/evals/scenario-2/task.md | 13 ++ .../evals/scenario-3/capability.txt | 1 + .../evals/scenario-3/criteria.json | 31 +++++ tiles/codeflash-docs/evals/scenario-3/task.md | 13 ++ .../evals/scenario-4/capability.txt | 1 + .../evals/scenario-4/criteria.json | 26 ++++ tiles/codeflash-docs/evals/scenario-4/task.md | 18 +++ .../evals/scenario-5/capability.txt | 1 + .../evals/scenario-5/criteria.json | 26 ++++ tiles/codeflash-docs/evals/scenario-5/task.md | 17 +++ tiles/codeflash-docs/evals/summary.json | 40 ++++++ .../evals/summary_infeasible.json | 25 ++++ 18 files changed, 414 insertions(+) create mode 100644 tiles/codeflash-docs/evals/capabilities.json create mode 100644 tiles/codeflash-docs/evals/scenario-1/capability.txt create mode 100644 tiles/codeflash-docs/evals/scenario-1/criteria.json create mode 100644 tiles/codeflash-docs/evals/scenario-1/task.md create mode 100644 tiles/codeflash-docs/evals/scenario-2/capability.txt create mode 100644 
tiles/codeflash-docs/evals/scenario-2/criteria.json create mode 100644 tiles/codeflash-docs/evals/scenario-2/task.md create mode 100644 tiles/codeflash-docs/evals/scenario-3/capability.txt create mode 100644 tiles/codeflash-docs/evals/scenario-3/criteria.json create mode 100644 tiles/codeflash-docs/evals/scenario-3/task.md create mode 100644 tiles/codeflash-docs/evals/scenario-4/capability.txt create mode 100644 tiles/codeflash-docs/evals/scenario-4/criteria.json create mode 100644 tiles/codeflash-docs/evals/scenario-4/task.md create mode 100644 tiles/codeflash-docs/evals/scenario-5/capability.txt create mode 100644 tiles/codeflash-docs/evals/scenario-5/criteria.json create mode 100644 tiles/codeflash-docs/evals/scenario-5/task.md create mode 100644 tiles/codeflash-docs/evals/summary.json create mode 100644 tiles/codeflash-docs/evals/summary_infeasible.json diff --git a/tiles/codeflash-docs/evals/capabilities.json b/tiles/codeflash-docs/evals/capabilities.json new file mode 100644 index 000000000..1e39768a4 --- /dev/null +++ b/tiles/codeflash-docs/evals/capabilities.json @@ -0,0 +1,118 @@ +{ + "package_name": "codeflash-docs", + "total_capabilities": 16, + "capabilities": [ + { + "id": 0, + "name": "pipeline-stage-ordering", + "description": "Know the correct ordering of codeflash pipeline stages: Discovery → Ranking → Context Extraction → Test Gen + Optimization (concurrent) → Baseline → Candidate Evaluation → PR", + "complexity": "basic", + "api_elements": ["Optimizer.run()", "FunctionOptimizer.optimize_function()"] + }, + { + "id": 1, + "name": "function-to-optimize-fields", + "description": "Know FunctionToOptimize key fields (function_name, file_path, parents, starting_line/ending_line, is_async, is_method, language) and properties (qualified_name, top_level_parent_name, class_name)", + "complexity": "intermediate", + "api_elements": ["FunctionToOptimize", "FunctionParent", "models/function_types.py"] + }, + { + "id": 2, + "name": 
"code-strings-markdown-format", + "description": "Know that code is serialized as markdown fenced blocks with language:filepath syntax (```python:filepath\\ncode\\n```) and parsed via CodeStringsMarkdown.parse_markdown_code()", + "complexity": "intermediate", + "api_elements": ["CodeStringsMarkdown", "CodeString", ".markdown", ".flat", "parse_markdown_code()"] + }, + { + "id": 3, + "name": "read-writable-vs-read-only", + "description": "Distinguish read_writable_code (LLM can modify) from read_only_context_code (reference only) in CodeOptimizationContext", + "complexity": "basic", + "api_elements": ["CodeOptimizationContext", "read_writable_code", "read_only_context_code"] + }, + { + "id": 4, + "name": "candidate-source-types", + "description": "Know OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE and when each is used", + "complexity": "intermediate", + "api_elements": ["OptimizedCandidateSource", "OptimizedCandidate"] + }, + { + "id": 5, + "name": "candidate-forest-dag", + "description": "Know that candidates form a forest/DAG via parent_id references where refinements and repairs build on previous candidates", + "complexity": "intermediate", + "api_elements": ["parent_id", "OptimizedCandidate", "CandidateForest"] + }, + { + "id": 6, + "name": "concurrent-testgen-optimization", + "description": "Know that test generation and LLM optimization run concurrently using concurrent.futures, not sequentially", + "complexity": "intermediate", + "api_elements": ["concurrent.futures", "FunctionOptimizer.optimize_function()"] + }, + { + "id": 7, + "name": "deterministic-patch-values", + "description": "Know the specific fixed values used by deterministic patches: time=1761717605.108106, datetime=2021-01-01 02:05:10 UTC, uuid=12345678-1234-5678-9abc-123456789012, random seeded with 42", + "complexity": "advanced", + "api_elements": ["_apply_deterministic_patches()", "pytest_plugin.py"] + }, + { + "id": 8, + "name": 
"test-type-enum", + "description": "Know the 6 TestType variants: EXISTING_UNIT_TEST, INSPIRED_REGRESSION, GENERATED_REGRESSION, REPLAY_TEST, CONCOLIC_COVERAGE_TEST, INIT_STATE_TEST", + "complexity": "basic", + "api_elements": ["TestType", "models/test_type.py"] + }, + { + "id": 9, + "name": "ai-service-endpoints", + "description": "Know the AI service endpoints: /ai/optimize, /ai/optimize_line_profiler, /ai/refine, /ai/repair, /ai/adaptive_optimize, /ai/rewrite_jit", + "complexity": "intermediate", + "api_elements": ["AiServiceClient", "api/aiservice.py"] + }, + { + "id": 10, + "name": "repair-request-structure", + "description": "Know that AIServiceCodeRepairRequest includes TestDiff objects with scope (RETURN_VALUE/STDOUT/DID_PASS), original vs candidate values, and test source code", + "complexity": "advanced", + "api_elements": ["AIServiceCodeRepairRequest", "TestDiff", "TestDiffScope"] + }, + { + "id": 11, + "name": "effort-level-values", + "description": "Know specific effort level values: LOW gets 3 candidates, MEDIUM gets 5, HIGH gets 6 (N_OPTIMIZER_CANDIDATES)", + "complexity": "intermediate", + "api_elements": ["EffortLevel", "N_OPTIMIZER_CANDIDATES", "EFFORT_VALUES"] + }, + { + "id": 12, + "name": "context-token-limits", + "description": "Know OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000 and TESTGEN_CONTEXT_TOKEN_LIMIT=16000 and that encoded_tokens_len() is used for counting", + "complexity": "basic", + "api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"] + }, + { + "id": 13, + "name": "best-candidate-selection", + "description": "Know the selection criteria: highest speedup, then shortest diff for ties, and refinement weighted ranking (2*runtime + 1*diff)", + "complexity": "advanced", + "api_elements": ["BestOptimization", "REFINED_CANDIDATE_RANKING_WEIGHTS"] + }, + { + "id": 14, + "name": "plugin-blocklists", + "description": "Know behavioral test blocklisted plugins (benchmark, codspeed, xdist, sugar) 
and benchmarking blocklist (adds cov, profiling)", + "complexity": "intermediate", + "api_elements": ["BEHAVIORAL_BLOCKLISTED_PLUGINS", "BENCHMARKING_BLOCKLISTED_PLUGINS"] + }, + { + "id": 15, + "name": "result-type-usage", + "description": "Know that Result[L,R] from either.py uses Success(value)/Failure(error) with is_successful() check before unwrap()", + "complexity": "basic", + "api_elements": ["Result", "Success", "Failure", "is_successful", "either.py"] + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-1/capability.txt b/tiles/codeflash-docs/evals/scenario-1/capability.txt new file mode 100644 index 000000000..5bd3f0115 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-1/capability.txt @@ -0,0 +1 @@ +Code serialization format and context splitting \ No newline at end of file diff --git a/tiles/codeflash-docs/evals/scenario-1/criteria.json b/tiles/codeflash-docs/evals/scenario-1/criteria.json new file mode 100644 index 000000000..48a4eb178 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-1/criteria.json @@ -0,0 +1,21 @@ +{ + "context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Markdown code block format", + "description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths", + "max_score": 30 + }, + { + "name": "Read-writable vs read-only split", + "description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable", + "max_score": 35 + }, + { + "name": "parse_markdown_code usage", + "description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured 
code, NOT manual string splitting or regex", + "max_score": 35 + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-1/task.md b/tiles/codeflash-docs/evals/scenario-1/task.md new file mode 100644 index 000000000..93761be4b --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-1/task.md @@ -0,0 +1,35 @@ +# Format Code for AI Service Request + +## Context + +You are working on the codeflash optimization engine. The AI service accepts optimization requests with source code and dependency context. A function `calculate_total` in `analytics/metrics.py` needs to be optimized. It calls a helper `normalize_values` in the same file (both modifiable), and imports `BaseMetric` from `analytics/base.py` (not modifiable, just for reference). + +```python +# analytics/metrics.py +from analytics.base import BaseMetric + +def normalize_values(data: list[float]) -> list[float]: + max_val = max(data) + return [x / max_val for x in data] + +def calculate_total(metrics: list[BaseMetric]) -> float: + values = [m.value for m in metrics] + normalized = normalize_values(values) + return sum(normalized) +``` + +```python +# analytics/base.py +class BaseMetric: + def __init__(self, name: str, value: float): + self.name = name + self.value = value +``` + +## Task + +Write a Python function `prepare_optimization_payload` that constructs the code payload for an AI service optimization request for `calculate_total`, properly formatting both the source code and the dependency code. Also write a separate function that parses the AI service response back into structured code objects.
+ +## Expected Outputs + +- A Python file `payload_builder.py` with the payload construction and response parsing logic diff --git a/tiles/codeflash-docs/evals/scenario-2/capability.txt b/tiles/codeflash-docs/evals/scenario-2/capability.txt new file mode 100644 index 000000000..5afa5a2e4 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-2/capability.txt @@ -0,0 +1 @@ +Candidate source types and DAG relationships \ No newline at end of file diff --git a/tiles/codeflash-docs/evals/scenario-2/criteria.json b/tiles/codeflash-docs/evals/scenario-2/criteria.json new file mode 100644 index 000000000..8460c1420 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-2/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Lists source types", + "description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE", + "max_score": 25 + }, + { + "name": "Parent ID linkage", + "description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates", + "max_score": 25 + }, + { + "name": "Refinement uses runtime data", + "description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code", + "max_score": 25 + }, + { + "name": "Repair uses test diffs", + "description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-2/task.md b/tiles/codeflash-docs/evals/scenario-2/task.md new file mode 100644 index 000000000..f55b25e3e --- /dev/null +++ 
b/tiles/codeflash-docs/evals/scenario-2/task.md @@ -0,0 +1,13 @@ +# Document the Candidate Lifecycle + +## Context + +A new engineer is joining the codeflash team and needs to understand how optimization candidates are generated, improved, and related to each other throughout the pipeline. They've asked for a clear explanation of the different ways candidates are produced and how the system iterates on them. + +## Task + +Write a technical document explaining the full lifecycle of an optimization candidate in codeflash — from initial generation through improvement iterations. Cover all the different ways candidates can be created, what data is sent to the AI service for each type, and how candidates relate to each other structurally. + +## Expected Outputs + +- A markdown file `candidate-lifecycle.md` diff --git a/tiles/codeflash-docs/evals/scenario-3/capability.txt b/tiles/codeflash-docs/evals/scenario-3/capability.txt new file mode 100644 index 000000000..707dd8109 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-3/capability.txt @@ -0,0 +1 @@ +Deterministic patch values and test execution architecture \ No newline at end of file diff --git a/tiles/codeflash-docs/evals/scenario-3/criteria.json b/tiles/codeflash-docs/evals/scenario-3/criteria.json new file mode 100644 index 000000000..bf5c9f34f --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-3/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Subprocess isolation", + "description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process", + "max_score": 20 + }, + { + "name": "Fixed time value", + "description": "References the specific fixed timestamp 1761717605.108106 for time.time() or the fixed datetime 
2021-01-01 02:05:10 UTC for datetime.now()", + "max_score": 20 + }, + { + "name": "Fixed UUID value", + "description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1", + "max_score": 20 + }, + { + "name": "Random seed", + "description": "States that random is seeded with 42 (NOT a different seed value)", + "max_score": 20 + }, + { + "name": "Plugin blocklists", + "description": "Mentions that behavioral tests block specific pytest plugins (at least 2 of: benchmark, codspeed, xdist, sugar) to ensure deterministic execution", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-3/task.md b/tiles/codeflash-docs/evals/scenario-3/task.md new file mode 100644 index 000000000..b3970b839 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-3/task.md @@ -0,0 +1,13 @@ +# Explain Test Reproducibility Guarantees + +## Context + +A codeflash user notices that their optimization candidate passes behavioral tests on one run but fails on the next. They suspect non-determinism in the test execution. They want to understand what guarantees codeflash provides for test reproducibility and how the system ensures consistent results. + +## Task + +Write a technical explanation of how codeflash ensures deterministic test execution. Cover the execution environment setup, what sources of non-determinism are controlled, and any specific values or configurations used. Also explain the test execution architecture. 
+ +## Expected Outputs + +- A markdown file `test-reproducibility.md` diff --git a/tiles/codeflash-docs/evals/scenario-4/capability.txt b/tiles/codeflash-docs/evals/scenario-4/capability.txt new file mode 100644 index 000000000..64848618a --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-4/capability.txt @@ -0,0 +1 @@ +Effort level configuration and candidate selection criteria \ No newline at end of file diff --git a/tiles/codeflash-docs/evals/scenario-4/criteria.json b/tiles/codeflash-docs/evals/scenario-4/criteria.json new file mode 100644 index 000000000..4fdc078ae --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-4/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent knows the specific effort level values for candidate generation and the criteria used to select the best optimization candidate.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Candidate counts by effort", + "description": "States correct N_OPTIMIZER_CANDIDATES values: LOW=3, MEDIUM=5, HIGH=6 (at least 2 of 3 correct)", + "max_score": 25 + }, + { + "name": "Speedup as primary selector", + "description": "States that the winning candidate is selected primarily by highest speedup ratio", + "max_score": 25 + }, + { + "name": "Diff length as tiebreaker", + "description": "States that for tied speedups, shortest diff length from original is used as tiebreaker", + "max_score": 25 + }, + { + "name": "Refinement ranking weights", + "description": "States that refinement candidates use weighted ranking with runtime weighted more heavily than diff (2:1 ratio or REFINED_CANDIDATE_RANKING_WEIGHTS=(2,1))", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-4/task.md b/tiles/codeflash-docs/evals/scenario-4/task.md new file mode 100644 index 000000000..e44e2738d --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-4/task.md @@ -0,0 +1,18 @@ +# Design a Candidate Selection Dashboard + +## Context + +The codeflash team wants to build a 
dashboard that shows users how optimization candidates were evaluated and why a particular candidate won. The dashboard needs to display the selection process at each stage, from initial candidate pool through to the final winner. + +## Task + +Write a specification document for the dashboard that explains: +1. How many candidates are generated at each effort level +2. The exact criteria and order of operations used to pick the winning candidate +3. How refinement candidates are ranked differently from initial candidates + +Include concrete examples showing how two hypothetical candidates would be compared. + +## Expected Outputs + +- A markdown file `selection-dashboard-spec.md` diff --git a/tiles/codeflash-docs/evals/scenario-5/capability.txt b/tiles/codeflash-docs/evals/scenario-5/capability.txt new file mode 100644 index 000000000..0ec01e24f --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-5/capability.txt @@ -0,0 +1 @@ +Pipeline concurrency and FunctionToOptimize structure \ No newline at end of file diff --git a/tiles/codeflash-docs/evals/scenario-5/criteria.json b/tiles/codeflash-docs/evals/scenario-5/criteria.json new file mode 100644 index 000000000..13887ac34 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-5/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent knows the FunctionToOptimize data structure and the concurrent execution model for test generation and optimization.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "FunctionToOptimize fields", + "description": "Includes at least 4 of: function_name, file_path, parents (list of FunctionParent), starting_line, ending_line, is_async, is_method, language", + "max_score": 25 + }, + { + "name": "Qualified name property", + "description": "Mentions qualified_name as a property that produces the full dotted name including parent classes (e.g., MyClass.my_method)", + "max_score": 25 + }, + { + "name": "Concurrent execution", + "description": "States that 
test generation and LLM optimization run concurrently (in parallel), NOT sequentially one after the other", + "max_score": 25 + }, + { + "name": "Entry point identification", + "description": "Correctly identifies Optimizer.run() as the top-level entry point and FunctionOptimizer.optimize_function() as the per-function entry point", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-docs/evals/scenario-5/task.md b/tiles/codeflash-docs/evals/scenario-5/task.md new file mode 100644 index 000000000..42cb34653 --- /dev/null +++ b/tiles/codeflash-docs/evals/scenario-5/task.md @@ -0,0 +1,17 @@ +# Implement a Function Optimization Status Tracker + +## Context + +The codeflash team needs a status tracker that logs what happens to each function during an optimization run. For each function, it should record the function identity, which pipeline stages it passed through, and how long each stage took. + +## Task + +Write a design document explaining: +1. What data structure represents a function being optimized, including its identity fields and how nested functions (methods inside classes) are represented +2. The full name resolution strategy for identifying functions uniquely +3. Which stages of the pipeline operate on a single function at a time vs. operating on multiple functions +4. 
Where in the codebase the per-function optimization is orchestrated and what the top-level entry point is + +## Expected Outputs + +- A markdown file `status-tracker-design.md` diff --git a/tiles/codeflash-docs/evals/summary.json b/tiles/codeflash-docs/evals/summary.json new file mode 100644 index 000000000..38e0ca577 --- /dev/null +++ b/tiles/codeflash-docs/evals/summary.json @@ -0,0 +1,40 @@ +{ + "total_scenarios": 5, + "capabilities_coverage": { + "total_capabilities": 16, + "capabilities_tested": 12, + "coverage_percentage": 75.0 + }, + "complexity_distribution": { + "basic": 1, + "intermediate": 3, + "advanced": 1 + }, + "scenarios": [ + { + "index": 1, + "capability": "code-strings-markdown-format, read-writable-vs-read-only", + "complexity": "intermediate" + }, + { + "index": 2, + "capability": "candidate-source-types, candidate-forest-dag, repair-request-structure", + "complexity": "intermediate" + }, + { + "index": 3, + "capability": "deterministic-patch-values, plugin-blocklists", + "complexity": "advanced" + }, + { + "index": 4, + "capability": "effort-level-values, best-candidate-selection", + "complexity": "intermediate" + }, + { + "index": 5, + "capability": "function-to-optimize-fields, concurrent-testgen-optimization, pipeline-stage-ordering", + "complexity": "basic" + } + ] +} diff --git a/tiles/codeflash-docs/evals/summary_infeasible.json b/tiles/codeflash-docs/evals/summary_infeasible.json new file mode 100644 index 000000000..7450bd0b1 --- /dev/null +++ b/tiles/codeflash-docs/evals/summary_infeasible.json @@ -0,0 +1,25 @@ +{ + "total_infeasible": 4, + "infeasible_capabilities": [ + { + "capability": "ai-service-endpoints", + "complexity": "intermediate", + "reasoning": "Testing knowledge of specific API endpoints requires actual HTTP requests or mocking that bypasses the capability being tested" + }, + { + "capability": "context-token-limits", + "complexity": "basic", + "reasoning": "Already covered by the skills tile eval (scenario-1). 
Testing token counting requires the actual tokenizer library" + }, + { + "capability": "test-type-enum", + "complexity": "basic", + "reasoning": "Simple enum knowledge is better verified through skills that use test types rather than isolated recall" + }, + { + "capability": "result-type-usage", + "complexity": "basic", + "reasoning": "Already covered by the skills tile eval (scenario-2). Testing Result type usage is better done through implementation tasks" + } + ] +}