diff --git a/app/config.py b/app/config.py index 1637be9..9509310 100644 --- a/app/config.py +++ b/app/config.py @@ -16,6 +16,7 @@ class Settings(BaseSettings): cache_s3_prefix: str = "explain-cache/" cache_ttl: str = "2d" # HTTP Cache-Control max-age (e.g., "2d", "48h", "172800s") cache_ttl_seconds: int = 172800 # Computed from cache_ttl for Cache-Control header + log_level: str = "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR) model_config = SettingsConfigDict(env_file=".env") @field_validator("cache_ttl_seconds", mode="before") diff --git a/app/explain.py b/app/explain.py index 1a68f93..35fff6c 100644 --- a/app/explain.py +++ b/app/explain.py @@ -80,8 +80,17 @@ async def _call_anthropic_api( # Generate messages using the Prompt instance prompt_data = prompt.generate_messages(body) + # Debug logging for prompts + LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===") + LOGGER.debug("=== SYSTEM PROMPT ===") + LOGGER.debug(prompt_data["system"]) + LOGGER.debug("=== MESSAGES ===") + for message in prompt_data["messages"]: + LOGGER.debug(message) + LOGGER.debug("=== END PROMPT DEBUG ===") + # Call Claude API - LOGGER.info(f"Using Anthropic client with model: {prompt_data['model']}") + LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) message = client.messages.create( model=prompt_data["model"], diff --git a/app/explanation_types.py b/app/explanation_types.py index bc0fdcb..e76f1f5 100644 --- a/app/explanation_types.py +++ b/app/explanation_types.py @@ -19,3 +19,4 @@ class ExplanationType(str, Enum): """Type of explanation to generate.""" ASSEMBLY = "assembly" + HAIKU = "haiku" diff --git a/app/main.py b/app/main.py index ed330f5..1fa2681 100644 --- a/app/main.py +++ b/app/main.py @@ -1,9 +1,10 @@ import logging +from contextlib import asynccontextmanager from pathlib import Path from anthropic import Anthropic from anthropic import __version__ as anthropic_version -from fastapi 
import FastAPI +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from mangum import Mangum @@ -20,11 +21,47 @@ from app.metrics import get_metrics_provider from app.prompt import Prompt -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() -logger.setLevel(logging.INFO) -app = FastAPI(root_path=get_settings().root_path) +def configure_logging(log_level: str) -> None: + """Configure logging with the specified level.""" + level = getattr(logging, log_level.upper(), logging.INFO) + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + force=True, # Reconfigure if already configured + ) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Configure app on startup, cleanup on shutdown.""" + # Startup + settings = get_settings() + configure_logging(settings.log_level) + logger = logging.getLogger(__name__) + + # Store shared resources in app.state + app.state.settings = settings + app.state.anthropic_client = Anthropic(api_key=settings.anthropic_api_key) + + # Load the prompt configuration + prompt_config_path = Path(__file__).parent / "prompt.yaml" + app.state.prompt = Prompt(prompt_config_path) + + logger.info(f"Application started with log level: {settings.log_level}") + logger.info(f"Anthropic SDK version: {anthropic_version}") + logger.info(f"Loaded prompt configuration from {prompt_config_path}") + + yield + + # Shutdown + logger.info("Application shutting down") + + +# Get settings once for app-level configuration +# This is acceptable since these settings don't change during runtime +_app_settings = get_settings() +app = FastAPI(root_path=_app_settings.root_path, lifespan=lifespan) # Configure CORS - allows all origins for public API app.add_middleware( @@ -37,18 +74,10 @@ ) handler = Mangum(app) -anthropic_client = Anthropic(api_key=get_settings().anthropic_api_key) -logger.info(f"Anthropic SDK version: {anthropic_version}") - -# Load the prompt 
configuration -prompt_config_path = Path(__file__).parent / "prompt.yaml" -prompt = Prompt(prompt_config_path) -logger.info(f"Loaded prompt configuration from {prompt_config_path}") - -def get_cache_provider(): +def get_cache_provider(settings) -> NoOpCacheProvider | S3CacheProvider: """Get the configured cache provider.""" - settings = get_settings() + logger = logging.getLogger(__name__) if not settings.cache_enabled: logger.info("Caching disabled by configuration") @@ -70,8 +99,9 @@ def get_cache_provider(): @app.get("/", response_model=AvailableOptions) -async def get_options() -> AvailableOptions: +async def get_options(request: Request) -> AvailableOptions: """Get available options for the explain API.""" + prompt = request.app.state.prompt async with get_metrics_provider() as metrics_provider: metrics_provider.put_metric("ClaudeExplainOptionsRequest", 1) return AvailableOptions( @@ -93,8 +123,14 @@ async def get_options() -> AvailableOptions: @app.post("/") -async def explain(request: ExplainRequest) -> ExplainResponse: +async def explain(explain_request: ExplainRequest, request: Request) -> ExplainResponse: """Explain a Compiler Explorer compilation from its source and output assembly.""" async with get_metrics_provider() as metrics_provider: - cache_provider = get_cache_provider() - return await process_request(request, anthropic_client, prompt, metrics_provider, cache_provider) + cache_provider = get_cache_provider(request.app.state.settings) + return await process_request( + explain_request, + request.app.state.anthropic_client, + request.app.state.prompt, + metrics_provider, + cache_provider, + ) diff --git a/app/prompt.py b/app/prompt.py index 2f73d13..8680d5c 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -46,14 +46,52 @@ def __init__(self, config: dict[str, Any] | Path): self.audience_levels = self.config["audience_levels"] self.explanation_types = self.config["explanation_types"] - def get_audience_metadata(self, audience: str) -> dict[str, str]: 
- """Get metadata for an audience level.""" - return self.audience_levels[audience] + def get_audience_metadata(self, audience: str, for_explanation: str | None = None) -> dict[str, str]: + """Get metadata for an audience level (and optionally an explanation type).""" + return self.get_audience_metadata_from_dict(self.config, audience, for_explanation) def get_explanation_metadata(self, explanation: str) -> dict[str, str]: """Get metadata for an explanation type.""" return self.explanation_types[explanation] + @classmethod + def get_audience_metadata_from_dict( + cls, prompt_dict: dict[str, Any], audience: str, for_explanation: str | None = None + ) -> dict[str, str]: + """Get audience metadata from a prompt dict structure (for use by prompt_advisor). + + This is a class method version of get_audience_metadata that works with + raw prompt dictionaries instead of Prompt instances. + """ + if "audience_levels" not in prompt_dict: + return {} + + audience_metadata = prompt_dict["audience_levels"].get(audience, {}) + + if for_explanation and "explanation_types" in prompt_dict: + explanation_config = prompt_dict["explanation_types"].get(for_explanation, {}) + if "audience_levels" in explanation_config: + explanation_audience = explanation_config["audience_levels"].get(audience, {}) + if explanation_audience: + # Merge base audience metadata with explanation-specific overrides + audience_metadata = {**audience_metadata, **explanation_audience} + + return audience_metadata + + @classmethod + def has_audience_override(cls, prompt_dict: dict[str, Any], explanation: str, audience: str) -> bool: + """Check if an explanation type has audience-specific overrides.""" + return ( + "explanation_types" in prompt_dict + and explanation in prompt_dict["explanation_types"] + and "audience_levels" in prompt_dict["explanation_types"][explanation] + and audience in prompt_dict["explanation_types"][explanation]["audience_levels"] + ) + + # Note: In the future, prompt_advisor may need the 
ability to create new + # explanation-specific audience overrides (like we did manually for haiku). + # This would involve adding new audience_levels sections within explanation_types. + def select_important_assembly( self, asm_array: list[dict], label_definitions: dict, max_lines: int = MAX_ASSEMBLY_LINES ) -> list[dict]: @@ -192,34 +230,23 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: - structured_data: The prepared data (for reference/debugging) """ # Get metadata - audience_meta = self.get_audience_metadata(request.audience.value) explanation_meta = self.get_explanation_metadata(request.explanation.value) - - # Prepare structured data + audience_meta = self.get_audience_metadata(request.audience.value, request.explanation.value) structured_data = self.prepare_structured_data(request) - # Format the system prompt - arch = request.instruction_set_with_default - system_prompt = self.system_prompt_template.format( - arch=arch, - language=request.language, - audience=request.audience.value, - audience_guidance=audience_meta["guidance"], - explanation_type=request.explanation.value, - explanation_focus=explanation_meta["focus"], - ) - - # Format the user prompt - user_prompt = self.user_prompt_template.format( - arch=arch, - user_prompt_phrase=explanation_meta["user_prompt_phrase"], - ) - - # Format the assistant prefill - assistant_prefill = self.assistant_prefill.format( - user_prompt_phrase=explanation_meta["user_prompt_phrase"], - audience=request.audience.value, - ) + prompt_dictionary = { + "arch": request.instruction_set_with_default, + "audience_guidance": audience_meta["guidance"] or "", # a null (bare-key) override must not render as the text None + "audience": request.audience.value, + "explanation_focus": explanation_meta["focus"], + "explanation_type": request.explanation.value, + "language": request.language, + "user_prompt_phrase": explanation_meta["user_prompt_phrase"], + } + # Format the prompts and prefill + system_prompt = self.system_prompt_template.format(**prompt_dictionary) + 
user_prompt = self.user_prompt_template.format(**prompt_dictionary) + assistant_prefill = self.assistant_prefill.format(**prompt_dictionary) # Build messages array messages = [ diff --git a/app/prompt.yaml b/app/prompt.yaml index 0860bec..a5dcaee 100644 --- a/app/prompt.yaml +++ b/app/prompt.yaml @@ -1,96 +1,92 @@ -name: Production v9 -description: Confident compilation options, undefined behavior handling, - conciseness and calibrated confidence +name: Matt fiddlings based on production 9 +description: Added Haiku and attempted to clean up a bunch of things model: name: claude-3-5-haiku-20241022 max_tokens: 1024 temperature: 0.0 audience_levels: beginner: - description: For beginners learning assembly language. Uses simple language - and explains technical terms. + description: For beginners learning assembly language. Uses simple language and explains technical terms. guidance: | - Use simple, clear language. Define technical terms when first used. - Explain concepts step-by-step. Avoid overwhelming with too many details at once. - Use analogies where helpful to explain complex concepts. - When explaining register usage, explicitly mention calling conventions (e.g., 'By convention, register X is used for...'). + - Include foundational concepts about assembly basics, register purposes, and memory organization. When function calls or parameter handling appear in the assembly, explain the calling convention patterns being used and why specific registers are chosen. + - Use simple, clear language. Define technical terms inline when first used (e.g., 'vectorization means processing multiple data elements simultaneously'). + - Explain concepts step-by-step. Avoid overwhelming with too many details at once. + - Use analogies where helpful to explain complex concepts. + - When explaining register usage, explicitly mention calling conventions (e.g., 'By convention, register X is used for...'). 
experienced: - description: For users familiar with assembly concepts and compiler - behavior. Focuses on optimizations and technical details. + description: For users familiar with assembly concepts and compiler behavior. Focuses on optimizations and technical details. guidance: | - Assume familiarity with basic assembly concepts and common instructions. - Use technical terminology appropriately but explain advanced concepts when relevant. - Focus on the 'why' behind compiler choices, optimizations, and microarchitectural details. - Explain performance implications, trade-offs, and edge cases. - When analyzing assembly code, verify instruction behavior by understanding inputs, operations, and outputs. Be especially careful with multi-operand instructions. Only discuss optimization levels when clear from the code patterns. - When discussing compiler optimizations, distinguish between: constant folding, dead code elimination, register allocation, instruction selection, loop optimizations, and inlining. Explain which specific optimizations are present or absent. - Discuss performance characteristics at the CPU pipeline level when relevant. + - Focus on optimization reasoning and architectural trade-offs. Explain not just what the compiler did, but why it made those choices and what alternatives existed. Discuss how different code patterns lead to different assembly outcomes, and provide insights that help developers write more compiler-friendly code. Include performance implications, practical considerations for real-world usage, and microarchitectural details when relevant. + - Assume familiarity with basic assembly concepts and common instructions. + - Use technical terminology appropriately but explain advanced concepts when relevant. + - Focus on the 'why' behind compiler choices, optimizations, and microarchitectural details. + - Explain performance implications, trade-offs, and edge cases. 
+ - When analyzing assembly code, verify instruction behavior by understanding inputs, operations, and outputs. Be especially careful with multi-operand instructions. Only discuss optimization levels when clear from the code patterns. + - When discussing compiler optimizations, distinguish between: constant folding, dead code elimination, register allocation, instruction selection, loop optimizations, and inlining. Explain which specific optimizations are present or absent. + - Discuss performance characteristics at the CPU pipeline level when relevant. explanation_types: assembly: description: Explains the assembly instructions and their purpose. focus: | - Focus on explaining the assembly instructions and their purpose. - Group related instructions together and explain their collective function. - Highlight important patterns like calling conventions, stack management, and control flow. - When explaining register usage, explicitly mention calling conventions (e.g., 'By convention, register X is used for...'). + - Structure explanations by leading with the single most important insight or pattern first, then build supporting details around it. + - Focus on explaining the assembly instructions and their purpose. + - Group related instructions together and explain their collective function. + - Highlight important patterns like calling conventions, stack management, and control flow. + - When explaining register usage, explicitly mention calling conventions (e.g., 'By convention, register X is used for...'). + - Focus on the most illuminating aspects of the assembly code. Structure explanations by leading with the single most important insight or pattern first, then build supporting details around it. Ask yourself: 'What's the one thing this audience most needs to understand about this assembly?' Start there, then add context and details. Lead with the key concept or optimization pattern, then provide supporting details as needed. 
Use backticks around technical terms, instruction names, and specific values (e.g., `mov`, `rax`, `0x42`) to improve readability. When relevant, explain stack frame setup decisions and when compilers choose registers over stack storage. When optimization choices create notable patterns in the assembly, discuss what optimizations appear to be applied and their implications. For any code where it adds insight, compare the shown assembly with what other optimization levels (-O0, -O1, -O2, -O3) would produce, explaining specific optimizations present or missing. When showing unoptimized code, describe what optimized versions would look like and why those optimizations improve performance. When analyzing unoptimized code and it's relevant, identify missed optimization opportunities and explain what optimized assembly would look like. For optimized code, explain the specific optimizations applied and their trade-offs. + - Keep explanations concise and focused on the most important insights. Aim for explanations that are shorter than or comparable in length to the assembly code being analyzed. In summary sections (like "Key Observations"), prioritize the most essential points rather than providing comprehensive coverage. Avoid lengthy explanations that exceed the complexity of the code itself. + - When relevant, compare the generated assembly with what other optimization levels or architectures might produce + - Structure explanations to lead with key insights rather than comprehensive coverage. Ask yourself: what's the most valuable thing for this audience to understand about this assembly? + + user_prompt_phrase: assembly output + haiku: + description: Tries to capture the essence of the code as a haiku. + focus: | + Focus on the overall behavior and intent of the code. Use vivid imagery and concise language to convey meaning. + Highlight key actions and their significance. Stick to the form of a three-line haiku. + Produce no other output than the haiku itself. 
user_prompt_phrase: assembly output + audience_levels: + beginner: + guidance: "" # explicit empty string; a bare key parses as null + experienced: + guidance: "" # explicit empty string; a bare key parses as null system_prompt: | - You are an expert in {arch} assembly code and {language}, helping users of the - Compiler Explorer website understand how their code compiles to assembly. - The request will be in the form of a JSON document, which explains a source program and how it was compiled, - and the resulting assembly code that was generated. + You are an expert in {arch} assembly code and {language}, helping users of the Compiler Explorer website understand how their code compiles to assembly. + The request will be in the form of a JSON document, which explains a source program and how it was compiled, and the resulting assembly code that was generated. - Target audience: {audience} - {audience_guidance} + ## Overall guidelines: - For beginners: Include foundational concepts about assembly basics, register purposes, and memory organization. When function calls or parameter handling appear in the assembly, explain the calling convention patterns being used and why specific registers are chosen. - For experienced: Focus on optimization reasoning and architectural trade-offs. Explain not just what the compiler did, but why it made those choices and what alternatives existed. Discuss how different code patterns lead to different assembly outcomes, and provide insights that help developers write more compiler-friendly code. Include performance implications, practical considerations for real-world usage, and microarchitectural details when relevant. + Use these guidelines as appropriate. The user's request is more important than these; if the user prompt asks for a specific output type, ensure you stick to that. To the extent you need to explain things, use these guidelines. 
- Explanation type: {explanation_type} - {explanation_focus} - - Guidelines: - - Compilation Options Interpretation: When analyzing assembly code, confidently interpret compiler behavior based on the compilation options provided. If compilation options are empty or contain no optimization/debug flags, this definitively means compiler defaults (unoptimized code with standard settings). State this confidently: "This is unoptimized code" - never use tentative language like "likely -O0", "appears to be", or "probably unoptimized". The absence of optimization flags is definitive information, not speculation. When explicit flags are present (like -O1, -O2, -g, -march=native), reference them directly and explain their specific effects on the assembly output. - - Undefined Behavior Handling: When analyzing code that contains undefined behavior (such as multiple modifications of the same variable in a single expression, data races, or other language-undefined constructs), recognize this and adjust your explanation approach. Instead of trying to definitively map assembly instructions to specific source operations, explain that the behavior is undefined and the compiler was free to implement it in any way. Describe what the compiler chose to do as "one possible implementation" or "the compiler's chosen approach" rather than claiming it's the correct or expected mapping. Focus on the educational value by explaining why such code is problematic and should be avoided, while still walking through what the generated assembly actually does. - - Conciseness and Focus: Keep explanations concise and focused on the most important insights. Aim for explanations that are shorter than or comparable in length to the assembly code being analyzed. In summary sections (like "Key Observations"), prioritize the most essential points rather than providing comprehensive coverage. Avoid lengthy explanations that exceed the complexity of the code itself. 
- - Confidence Calibration: Be definitive about what can be directly observed in the assembly code (instruction behavior, register usage, memory operations). Be appropriately cautious about inferring purposes, reasons, or design decisions without clear evidence. Avoid making definitive claims about efficiency, performance characteristics, or optimization strategies unless they can be clearly substantiated from the visible code patterns. When comparing to other optimization levels, only do so when directly relevant to understanding the current assembly code. - - Focus on the most illuminating aspects of the assembly code. Structure explanations by leading with the single most important insight or pattern first, then build supporting details around it. Ask yourself: 'What's the one thing this audience most needs to understand about this assembly?' Start there, then add context and details. Lead with the key concept or optimization pattern, then provide supporting details as needed. Use backticks around technical terms, instruction names, and specific values (e.g., `mov`, `rax`, `0x42`) to improve readability. When relevant, explain stack frame setup decisions and when compilers choose registers over stack storage. When optimization choices create notable patterns in the assembly, discuss what optimizations appear to be applied and their implications. For any code where it adds insight, compare the shown assembly with what other optimization levels (-O0, -O1, -O2, -O3) would produce, explaining specific optimizations present or missing. When showing unoptimized code, describe what optimized versions would look like and why those optimizations improve performance. When analyzing unoptimized code and it's relevant, identify missed optimization opportunities and explain what optimized assembly would look like. For optimized code, explain the specific optimizations applied and their trade-offs. 
+ - When analyzing assembly code, confidently interpret compiler behavior based on the compilation options provided. If compilation options are empty or contain no optimization/debug flags, this definitively means compiler defaults (unoptimized code with standard settings). State this confidently: "This is unoptimized code" - never use tentative language like "likely -O0", "appears to be", or "probably unoptimized". The absence of optimization flags is definitive information, not speculation. When explicit flags are present (like -O1, -O2, -g, -march=native), reference them directly and explain their specific effects on the assembly output. + - When analyzing code that contains undefined behavior (such as multiple modifications of the same variable in a single expression, data races, or other language-undefined constructs), recognize this and adjust your explanation approach. Instead of trying to definitively map assembly instructions to specific source operations, explain that the behavior is undefined and the compiler was free to implement it in any way. Describe what the compiler chose to do as "one possible implementation" or "the compiler's chosen approach" rather than claiming it's the correct or expected mapping. Focus on the educational value by explaining why such code is problematic and should be avoided, while still walking through what the generated assembly actually does. + - Be definitive about what can be directly observed in the assembly code (instruction behavior, register usage, memory operations). Be appropriately cautious about inferring purposes, reasons, or design decisions without clear evidence. Avoid making definitive claims about efficiency, performance characteristics, or optimization strategies unless they can be clearly substantiated from the visible code patterns. When comparing to other optimization levels, only do so when directly relevant to understanding the current assembly code. 
- Unless requested, give no commentary on the original source code itself - assume the user understands their input - Reference source code only when it helps explain the assembly mapping - - Do not provide an overall conclusion or summary - Be precise and accurate about CPU features and optimizations. Before explaining any instruction's behavior, trace through its inputs and outputs step-by-step to verify correctness. For multi-operand instructions, explicitly identify which operand is the source and which is the destination. Pay special attention to instructions like `lea` (Load Effective Address) - verify whether they perform memory access or just address calculation, as this is a common source of confusion. Double-check all register modifications and mathematical operations by working through the values. When discussing optimization patterns, describe what you observe in the code based on the compilation options provided. If compilation options indicate unoptimized code (empty options or no optimization flags), state this definitively: 'This is unoptimized code' and explain the observable characteristics. (e.g., 'single-cycle' operations) unless you can verify them for the specific architecture. Before explaining what an instruction does, carefully verify its actual behavior - trace through each instruction's inputs and outputs step by step. Qualify performance statements with appropriate caveats (e.g., 'typically', 'on most modern processors', 'depending on the specific CPU'). Double-check mathematical operations and register modifications. - - Avoid incorrect claims about hardware details like branch prediction + - Avoid incorrect claims about hardware details like branch prediction, cache performance, CPU pipelining etc. - When analyzing code, accurately characterize the optimization level shown. Don't claim code is 'optimal' or 'efficient' when it's clearly unoptimized. 
Distinguish between different optimization strategies (unrolling, tail recursion elimination, etc.) and explain the trade-offs. When showing unoptimized code, explicitly state "This is unoptimized code" without tentative qualifiers, and explain what optimizations are missing and why they would help. - For mathematical operations, verify each step by tracing register values through the instruction sequence - When there are notable performance trade-offs or optimization opportunities, discuss their practical impact. Explain why certain instruction choices are made (e.g., lea vs add, imul vs shift+add), discuss stack vs register storage decisions, and provide practical insights about writing compiler-friendly code when these insights would be valuable. For unoptimized code with significant performance issues, quantify the performance cost and explain what optimizations would address it. - - When relevant, compare the generated assembly with what other optimization levels or architectures might produce - - If the optimization level can be inferred from the assembly patterns and is relevant to understanding the code, mention it in context and compare with other levels when it adds insight. - - Weave calling convention details (parameter passing, register usage, stack vs register decisions) into the explanation where they illuminate the assembly's behavior. - When discussing performance, use qualified language ('typically', 'on most processors') rather than absolute claims. - - When analyzing unoptimized code, explain why the compiler made seemingly inefficient choices (like unnecessary stack operations for simple functions) and what optimizations would eliminate these patterns. Help readers understand the difference between 'correct but inefficient' and 'optimized' assembly. + - When analyzing unoptimized code, explicitly state 'This is unoptimized code' early and identify specific redundancies (like store-then-load patterns). 
Explain why the compiler made seemingly inefficient choices (like unnecessary stack operations for simple functions) and what optimizations would eliminate these patterns. Help readers understand the difference between 'correct but inefficient' and 'optimized' assembly. + - When analyzing simple functions that use stack operations unnecessarily, explain why unoptimized compilers make these choices and what the optimized version would look like. - Provide practical insights that help developers understand how to write more compiler-friendly code. - - - # Additional guidance from analysis: - When analyzing assembly code, verify instruction behavior carefully by understanding inputs, operations, and outputs. Be especially careful with multi-operand instructions like imul and lea. Only make claims about optimization levels when they can be clearly determined from the code patterns. - When explaining register usage patterns that might confuse the reader, clarify the roles of different registers, including parameter passing, return values, and caller/callee-saved conventions where relevant. - When discussing compiler optimizations, distinguish between: constant folding, dead code elimination, register allocation, instruction selection, loop optimizations, and inlining. Explain which specific optimizations are present or absent. - - - # Additional guidance from analysis: - - When analyzing simple functions that use stack operations unnecessarily, explain why unoptimized compilers make these choices and what the optimized version would look like. - - Structure explanations to lead with key insights rather than comprehensive coverage. Ask yourself: what's the most valuable thing for this audience to understand about this assembly? - - - # Additional guidance from analysis: - Use backticks around technical terms, instruction names, and specific values (e.g., `mov`, `rax`, `0x42`) to improve readability. 
- Pay special attention to instructions like `lea` (Load Effective Address) - verify whether they perform memory access or just address calculation, as this is a common source of confusion. - - Structure explanations by leading with the single most important insight or pattern first, then build supporting details around it. + - **Do not provide an overall conclusion or summary** +user_prompt: | + Explain the {arch} {user_prompt_phrase}. + + ## Target audience: {audience} + {audience_guidance} + + ## Explanation type: {explanation_type} + {explanation_focus} - # Additional guidance from analysis: - - When analyzing unoptimized code, explicitly state 'This is unoptimized code' early and identify specific redundancies (like store-then-load patterns). - - For beginner audiences, define technical terms inline when first used (e.g., 'vectorization means processing multiple data elements simultaneously'). -user_prompt: Explain the {arch} {user_prompt_phrase}. -assistant_prefill: "I'll analyze the {user_prompt_phrase} and explain it for {audience} - level:" +assistant_prefill: "I'll analyze the {user_prompt_phrase} and explain it for {audience} level:" diff --git a/app/test_explain.py b/app/test_explain.py index 9d76573..e9a52a4 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -116,11 +116,15 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie # Verify the system prompt contains appropriate instructions system_prompt = kwargs["system"] - assert "beginner" in system_prompt.lower() assert "assembly" in system_prompt.lower() assert "c++" in system_prompt.lower() assert "amd64" in system_prompt.lower() + # Check that audience information is in the user prompt (messages) + messages = kwargs["messages"] + user_message = messages[0]["content"][0]["text"] + assert "beginner" in user_message.lower() + # Check that the messages array contains user and assistant messages messages = kwargs["messages"] assert len(messages) == 2 diff --git 
a/claude_explain.md b/claude_explain.md index cc04651..6e35426 100644 --- a/claude_explain.md +++ b/claude_explain.md @@ -31,7 +31,8 @@ This dual-mode design provides excellent developer experience while maintaining 3. **Claude Integration** - Uses Claude 3.5 Haiku model for improved accuracy - Structured JSON input preserving source-to-assembly mappings - - Configurable audience levels and explanation types + - Configurable audience levels (beginner, experienced) and explanation types (assembly, haiku) + - Flexible prompt composition with explanation-type overrides for audience guidance - Token usage and cost metrics 4. **Infrastructure** (when deployed to AWS) @@ -54,6 +55,52 @@ This dual-mode design provides excellent developer experience while maintaining - Cost breakdown - Model information +## Explanation Types and Audience Configuration + +The service supports multiple explanation types with flexible audience targeting: + +### Explanation Types + +1. **Assembly Explanations** (`assembly`) + - Detailed technical explanations of the compiled assembly code + - Focuses on instruction behavior, compiler optimizations, and performance analysis + - Uses full audience-specific guidance for appropriate depth and terminology + +2. 
**Haiku Explanations** (`haiku`) + - Poetic 3-line summaries capturing the essence of the code's behavior + - Uses vivid imagery and concise language to convey meaning + - Minimal audience differentiation - both beginners and experienced users receive the same poetic treatment + +### Audience Levels + +- **Beginner**: Simple language, foundational concepts, calling convention explanations +- **Experienced**: Technical terminology, optimization focus, advanced microarchitectural details + +### Flexible Prompt Composition + +The prompt system supports explanation-type-specific audience overrides: + +- **Base audience guidance**: Defined in the `audience_levels` section of the prompt configuration +- **Type-specific overrides**: Explanation types can override audience guidance via nested `audience_levels` sections +- **Template variable system**: All prompt components (system, user, assistant prefill) use shared template variables + +Example configuration structure: +```yaml +audience_levels: + beginner: + guidance: "Base guidance for beginners..." + +explanation_types: + haiku: + audience_levels: + beginner: + guidance: "" # Override to disable assembly-specific guidance + experienced: + guidance: "" # Same empty override, so both audiences are treated identically +``` + +This system allows haiku explanations to bypass technical assembly guidance while assembly explanations retain full audience differentiation. 
+ ## Prompt Testing Framework A comprehensive prompt testing system has been developed to: @@ -65,11 +112,11 @@ A comprehensive prompt testing system has been developed to: - Ensure consistent quality across various code examples ### Architecture -- **Test Cases**: YAML-based test scenarios with expected topics -- **Prompts**: Versioned prompt templates with variable substitution +- **Test Cases**: YAML-based test scenarios with expected topics, including haiku-specific test cases +- **Prompts**: Versioned prompt templates with flexible variable substitution and explanation-type overrides - **Scoring**: Multiple evaluation methods: - Automatic scoring based on heuristics - - Claude-based evaluation for accuracy and clarity + - Claude-based evaluation using shared scoring criteria plus an explanation-type-specific expectation (assembly vs haiku) - Human review interface for manual assessment - **Enrichment**: Integration with Compiler Explorer API to fetch real assembly data diff --git a/prompt_testing/evaluation/claude_reviewer.py b/prompt_testing/evaluation/claude_reviewer.py index 5c7a312..ce86caf 100644 --- a/prompt_testing/evaluation/claude_reviewer.py +++ b/prompt_testing/evaluation/claude_reviewer.py @@ -97,6 +97,8 @@ class ReviewCriteria: _EXPLANATION_TYPE = { ExplanationType.ASSEMBLY: """The explanation should be predominantly about the compiled assembly.""", + ExplanationType.HAIKU: """The explanation should be in the form of a haiku, + capturing the essence of the code's behavior in a poetic way.""", } @@ -126,6 +128,15 @@ def _build_evaluation_prompt( ) -> str: """Build the evaluation prompt for Claude.""" + # Use the same criteria for all explanation types + criteria = { + "accuracy": self.criteria.accuracy, + "relevance": self.criteria.relevance, + "conciseness": self.criteria.conciseness, + "insight": self.criteria.insight, + "appropriateness": self.criteria.appropriateness, + } + prompt = f"""You are an expert in compiler technology and technical education. 
Your task is to evaluate an AI-generated explanation of Compiler Explorer's output using our metrics. @@ -160,24 +171,24 @@ def _build_evaluation_prompt( Test case description: {test_case.get("description", "No description provided")} -## NEW METRICS SYSTEM +## METRICS SYSTEM Evaluate the explanation on these 5 dimensions: 1. **Accuracy (0-100)** -{self.criteria.accuracy} +{criteria["accuracy"]} 2. **Relevance (0-100)** -{self.criteria.relevance} +{criteria["relevance"]} 3. **Conciseness (0-100)** -{self.criteria.conciseness} +{criteria["conciseness"]} 4. **Insight (0-100)** -{self.criteria.insight} +{criteria["insight"]} 5. **Appropriateness (0-100)** -{self.criteria.appropriateness} +{criteria["appropriateness"]} """ diff --git a/prompt_testing/evaluation/prompt_advisor.py b/prompt_testing/evaluation/prompt_advisor.py index 31b8d14..d240b08 100644 --- a/prompt_testing/evaluation/prompt_advisor.py +++ b/prompt_testing/evaluation/prompt_advisor.py @@ -622,16 +622,32 @@ def _apply_targeted_improvement(self, new_prompt: dict[str, Any], improvement: d ) and "system_prompt" in new_prompt: new_prompt["system_prompt"] = new_prompt["system_prompt"].replace(current_text, suggested_text) - # Apply to specific audience levels - if targets["audiences"] and "audience_levels" in new_prompt: + # Apply to specific audience levels (check both base and explanation-specific locations) + # TODO: In the future, we may need to create new explanation-specific audience overrides + if targets["audiences"]: for audience in targets["audiences"]: - if audience in new_prompt["audience_levels"]: + # Check base audience level + if "audience_levels" in new_prompt and audience in new_prompt["audience_levels"]: guidance = new_prompt["audience_levels"][audience].get("guidance", "") if current_text in guidance: new_prompt["audience_levels"][audience]["guidance"] = guidance.replace( current_text, suggested_text ) + # Check explanation-specific audience overrides + if "explanation_types" in 
new_prompt: + for exp_config in new_prompt["explanation_types"].values(): + if ( + isinstance(exp_config, dict) + and "audience_levels" in exp_config + and audience in exp_config["audience_levels"] + ): + guidance = exp_config["audience_levels"][audience].get("guidance", "") + if current_text in guidance: + exp_config["audience_levels"][audience]["guidance"] = guidance.replace( + current_text, suggested_text + ) + # Apply to specific explanation types if targets["explanation_types"] and "explanation_types" in new_prompt: for exp_type in targets["explanation_types"]: @@ -648,16 +664,31 @@ def _apply_targeted_additions(self, new_prompt: dict[str, Any], additions: list[ targets = self._classify_suggestion_target(addition) applied = False - # Apply to specific audience levels - if targets["audiences"] and "audience_levels" in new_prompt: + # Apply to specific audience levels (check both base and explanation-specific locations) + if targets["audiences"]: for audience in targets["audiences"]: - if audience in new_prompt["audience_levels"]: + # Check base audience level + if "audience_levels" in new_prompt and audience in new_prompt["audience_levels"]: current_guidance = new_prompt["audience_levels"][audience].get("guidance", "") new_prompt["audience_levels"][audience]["guidance"] = ( current_guidance.rstrip() + f"\n{addition}\n" ) applied = True + # Check explanation-specific audience overrides + if "explanation_types" in new_prompt: + for exp_config in new_prompt["explanation_types"].values(): + if ( + isinstance(exp_config, dict) + and "audience_levels" in exp_config + and audience in exp_config["audience_levels"] + ): + current_guidance = exp_config["audience_levels"][audience].get("guidance", "") + exp_config["audience_levels"][audience]["guidance"] = ( + current_guidance.rstrip() + f"\n{addition}\n" + ) + applied = True + # Apply to specific explanation types if targets["explanation_types"] and "explanation_types" in new_prompt: for exp_type in 
targets["explanation_types"]: diff --git a/prompt_testing/test_cases/haiku_tests.yaml b/prompt_testing/test_cases/haiku_tests.yaml new file mode 100644 index 0000000..64730eb --- /dev/null +++ b/prompt_testing/test_cases/haiku_tests.yaml @@ -0,0 +1,331 @@ +description: Test cases for haiku explanation type across different code complexities +cases: +- id: factorial_beginner_haiku + category: recursion + quality: good_example + description: Simple recursive factorial function for haiku generation + audience: beginner + explanation_type: haiku + input: + language: C++ + compiler: x86-64 gcc 12.2 + compilationOptions: + - -O2 + instructionSet: x86_64 + code: | + int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); + } + asm: + - text: 'factorial(int):' + - text: ' cmp edi, 1' + source: + line: 2 + - text: ' jle .L4' + source: + line: 2 + labels: + - name: .L4 + range: + startCol: 17 + endCol: 20 + - text: ' push rbx' + source: + line: 3 + - text: ' mov ebx, edi' + source: + line: 3 + - text: ' lea edi, [rdi-1]' + source: + line: 3 + - text: ' call factorial(int)' + source: + line: 3 + - text: ' imul eax, ebx' + source: + line: 3 + - text: ' pop rbx' + source: + line: 3 + - text: ' ret' + source: + line: 3 + - text: '.L4:' + - text: ' mov eax, 1' + source: + line: 2 + - text: ' ret' + source: + line: 2 + labelDefinitions: + factorial(int): 0 + .L4: 9 + +- id: factorial_experienced_haiku + category: recursion + quality: good_example + description: Same factorial but for experienced audience - should produce same haiku + audience: experienced + explanation_type: haiku + input: + language: C++ + compiler: x86-64 gcc 12.2 + compilationOptions: + - -O2 + instructionSet: x86_64 + code: | + int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); + } + asm: + - text: 'factorial(int):' + - text: ' cmp edi, 1' + source: + line: 2 + - text: ' jle .L4' + source: + line: 2 + labels: + - name: .L4 + range: + startCol: 17 + endCol: 20 + 
- text: ' push rbx' + source: + line: 3 + - text: ' mov ebx, edi' + source: + line: 3 + - text: ' lea edi, [rdi-1]' + source: + line: 3 + - text: ' call factorial(int)' + source: + line: 3 + - text: ' imul eax, ebx' + source: + line: 3 + - text: ' pop rbx' + source: + line: 3 + - text: ' ret' + source: + line: 3 + - text: '.L4:' + - text: ' mov eax, 1' + source: + line: 2 + - text: ' ret' + source: + line: 2 + labelDefinitions: + factorial(int): 0 + .L4: 9 + +- id: simple_loop_haiku + category: loops + quality: good_example + description: Simple loop for haiku generation + audience: beginner + explanation_type: haiku + input: + language: C++ + compiler: x86-64 gcc 12.2 + compilationOptions: + - -O2 + instructionSet: x86_64 + code: | + int sum_array(int* arr, int n) { + int sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum; + } + asm: + - text: 'sum_array(int*, int):' + - text: ' test esi, esi' + source: + line: 3 + - text: ' jle .L4' + source: + line: 3 + - text: ' lea eax, [rsi-1]' + source: + line: 3 + - text: ' lea rdx, [rdi+4+rax*4]' + source: + line: 4 + - text: ' xor eax, eax' + source: + line: 2 + - text: '.L3:' + source: + line: 3 + - text: ' add eax, DWORD PTR [rdi]' + source: + line: 4 + - text: ' add rdi, 4' + source: + line: 3 + - text: ' cmp rdi, rdx' + source: + line: 3 + - text: ' jne .L3' + source: + line: 3 + - text: ' ret' + source: + line: 6 + - text: '.L4:' + - text: ' xor eax, eax' + source: + line: 2 + - text: ' ret' + source: + line: 6 + labelDefinitions: + sum_array(int*, int): 0 + .L3: 6 + .L4: 12 + +- id: empty_function_haiku + category: edge_cases + quality: edge_case + description: Empty function edge case for haiku + audience: beginner + explanation_type: haiku + input: + language: C++ + compiler: x86-64 gcc 12.2 + compilationOptions: + - -O2 + instructionSet: x86_64 + code: | + void empty() { + } + asm: + - text: 'empty():' + - text: ' ret' + source: + line: 2 + labelDefinitions: + empty(): 0 + +- id: 
complex_sorting_haiku + category: algorithms + quality: challenging + description: More complex algorithm for haiku creativity test + audience: experienced + explanation_type: haiku + input: + language: C++ + compiler: x86-64 gcc 12.2 + compilationOptions: + - -O2 + instructionSet: x86_64 + code: | + void bubble_sort(int* arr, int n) { + for (int i = 0; i < n-1; i++) { + for (int j = 0; j < n-i-1; j++) { + if (arr[j] > arr[j+1]) { + int temp = arr[j]; + arr[j] = arr[j+1]; + arr[j+1] = temp; + } + } + } + } + asm: + - text: 'bubble_sort(int*, int):' + - text: ' cmp esi, 2' + source: + line: 2 + - text: ' jle .L1' + source: + line: 2 + - text: ' push r12' + - text: ' lea r10d, [rsi-1]' + source: + line: 2 + - text: ' push rbp' + - text: ' push rbx' + - text: ' mov rbx, rdi' + - text: ' xor r12d, r12d' + source: + line: 2 + - text: '.L6:' + - text: ' mov eax, r10d' + source: + line: 3 + - text: ' sub eax, r12d' + source: + line: 3 + - text: ' test eax, eax' + source: + line: 3 + - text: ' jle .L3' + source: + line: 3 + - text: ' mov rdx, rbx' + source: + line: 3 + - text: ' lea ecx, [rax-1]' + source: + line: 3 + - text: ' lea rdi, [rbx+4+rcx*4]' + source: + line: 3 + - text: '.L4:' + - text: ' mov eax, DWORD PTR [rdx]' + source: + line: 4 + - text: ' cmp eax, DWORD PTR [rdx+4]' + source: + line: 4 + - text: ' jle .L5' + source: + line: 4 + - text: ' mov ecx, DWORD PTR [rdx+4]' + source: + line: 5 + - text: ' mov DWORD PTR [rdx], ecx' + source: + line: 6 + - text: ' mov DWORD PTR [rdx+4], eax' + source: + line: 7 + - text: '.L5:' + - text: ' add rdx, 4' + source: + line: 3 + - text: ' cmp rdx, rdi' + source: + line: 3 + - text: ' jne .L4' + source: + line: 3 + - text: '.L3:' + - text: ' add r12d, 1' + source: + line: 2 + - text: ' cmp r12d, r10d' + source: + line: 2 + - text: ' jne .L6' + source: + line: 2 + - text: ' pop rbx' + - text: ' pop rbp' + - text: ' pop r12' + - text: '.L1:' + - text: ' ret' + labelDefinitions: + bubble_sort(int*, int): 0 + .L6: 8 + .L4: 16 
+ .L5: 22 + .L3: 25 + .L1: 31