From cd514f54a5275fcfe6484e9ce44abbc29bcfbfe0 Mon Sep 17 00:00:00 2001 From: praneeth_paikray-data Date: Tue, 24 Mar 2026 20:31:39 +0530 Subject: [PATCH] Add skill authoring workflow and CI quality checks (#338, #330) Evaluated Anthropic's skill-creator and cherry-picked its conversational authoring workflow for ai-dev-kit contributors. The existing .test/ framework already outperforms skill-creator on evaluation and optimization, so we focused on the one gap: a guided skill creation experience. What's included: - Contributor authoring skill (.skill-authoring/) with 6-phase workflow adapted from Anthropic's skill-creator (Apache 2.0, attributed) - Reference docs for skill format and test format specifications - Quick trigger validation script (.test/scripts/quick_trigger.py) - CI quality warnings in validate_skills.py (non-blocking): missing "Use when" triggers, body over 500 lines, untagged code blocks, broken reference file links - Upgraded TEMPLATE from 55-line placeholder to realistic template - Expanded CONTRIBUTING.md with quality checklist and full workflow - Added "Developing Skills" section to databricks-skills/README.md - Fixed "Use when" triggers for databricks-ai-functions and databricks-dbsql Co-authored-by: Isaac --- .github/scripts/validate_skills.py | 76 +++- .skill-authoring/SKILL.md | 414 ++++++++++++++++++ .skill-authoring/install_skill_authoring.sh | 25 ++ .skill-authoring/references/skill-format.md | 156 +++++++ .skill-authoring/references/test-format.md | 263 +++++++++++ .test/SKILL.md | 2 + .test/scripts/quick_trigger.py | 226 ++++++++++ CONTRIBUTING.md | 98 ++++- databricks-skills/README.md | 31 ++ databricks-skills/TEMPLATE/SKILL.md | 67 ++- databricks-skills/TEMPLATE/example_file1.md | 68 ++- databricks-skills/TEMPLATE/example_file2.md | 72 +-- .../databricks-ai-functions/SKILL.md | 2 +- databricks-skills/databricks-dbsql/SKILL.md | 1 + 14 files changed, 1418 insertions(+), 83 deletions(-) create mode 100644 .skill-authoring/SKILL.md 
create mode 100755 .skill-authoring/install_skill_authoring.sh create mode 100644 .skill-authoring/references/skill-format.md create mode 100644 .skill-authoring/references/test-format.md create mode 100644 .test/scripts/quick_trigger.py diff --git a/.github/scripts/validate_skills.py b/.github/scripts/validate_skills.py index e3bc38e4..29af99da 100644 --- a/.github/scripts/validate_skills.py +++ b/.github/scripts/validate_skills.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Validate skill structure and frontmatter. -Checks: +Checks (errors — block PRs): 1. Every skill directory has a SKILL.md file 2. SKILL.md has valid YAML frontmatter per best practices: - name: required, ≤64 chars, lowercase letters/numbers/hyphens only, @@ -9,6 +9,12 @@ - description: required, non-empty, ≤1024 chars, no XML tags 3. Local skill directories are registered in install_skills.sh (skill-list variables are auto-discovered, not hardcoded) + +Quality warnings (non-blocking): +4. Description should contain "Use when" trigger phrases +5. SKILL.md body should be under 500 lines (use reference files for overflow) +6. Code blocks should have language tags +7. 
Referenced files (markdown links) should exist """ import re @@ -26,6 +32,62 @@ XML_TAG_RE = re.compile(r"<[^>]+>") +CODE_BLOCK_RE = re.compile(r"^```(\w*)$", re.MULTILINE) +MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") +MAX_BODY_LINES = 500 + + +def quality_warnings(skill_dir: Path, content: str, frontmatter: dict) -> list[str]: + """Run non-blocking quality checks and return warnings.""" + warnings = [] + + # Check description contains "Use when" trigger phrases + desc = str(frontmatter.get("description", "")) + if desc and "use when" not in desc.lower(): + warnings.append( + f'{skill_dir.name}: description lacks "Use when" trigger phrases ' + f"(helps Claude decide when to activate the skill)" + ) + + # Check body length (lines below frontmatter) + body = re.sub(r"^---\n.+?\n---\n?", "", content, count=1, flags=re.DOTALL) + body_lines = len(body.strip().splitlines()) + if body_lines > MAX_BODY_LINES: + warnings.append( + f"{skill_dir.name}: SKILL.md body is {body_lines} lines " + f"(>{MAX_BODY_LINES}). Consider moving content to reference files." + ) + + # Check code blocks have language tags + # Every pair of ``` markers forms a block; even-indexed matches (0,2,4..) + # are opening markers, odd-indexed are closing markers. 
+ all_fences = list(CODE_BLOCK_RE.finditer(content)) + opening_fences = [all_fences[i] for i in range(0, len(all_fences), 2)] + untagged = sum(1 for m in opening_fences if not m.group(1)) + if untagged > 0: + warnings.append( + f"{skill_dir.name}: {untagged} code block(s) missing language tags " + f"(use ```python, ```sql, ```yaml, etc.)" + ) + + # Check referenced markdown files exist + for match in MD_LINK_RE.finditer(content): + link_target = match.group(2) + # Only check relative .md links (not URLs, not anchors) + if ( + not link_target.startswith("http") + and not link_target.startswith("#") + and link_target.endswith(".md") + ): + ref_path = skill_dir / link_target + if not ref_path.exists(): + warnings.append( + f"{skill_dir.name}: referenced file '{link_target}' not found" + ) + + return warnings + + def parse_frontmatter(content: str) -> dict | None: """Extract YAML frontmatter from markdown content.""" match = re.match(r"^---\n(.+?)\n---", content, re.DOTALL) @@ -117,6 +179,7 @@ def get_local_skill_dirs() -> set[str]: def main() -> int: errors: list[str] = [] + warnings: list[str] = [] actual_skills = get_local_skill_dirs() # --- Validate each skill directory's SKILL.md and frontmatter --- @@ -153,6 +216,10 @@ def main() -> int: for err in validate_description(str(frontmatter["description"])): errors.append(f"{skill_dir.name}: {err}") + # Quality warnings (non-blocking) + for warn in quality_warnings(skill_dir, content, frontmatter): + warnings.append(warn) + # --- Cross-reference with install_skills.sh --- install_content = INSTALL_SCRIPT.read_text() skill_vars, composite_vars = parse_skill_variables(install_content) @@ -182,6 +249,13 @@ def main() -> int: errors.append(f"Skills in {var_name} but no directory found: {sorted(missing)}") # --- Report --- + # Surface warnings (non-blocking) before errors + if warnings: + print(f"Quality warnings ({len(warnings)}):\n") + for warning in warnings: + print(f"::warning::{warning}") + print() + if errors: 
print("Skill validation failed:\n") for error in errors: diff --git a/.skill-authoring/SKILL.md b/.skill-authoring/SKILL.md new file mode 100644 index 00000000..be298b25 --- /dev/null +++ b/.skill-authoring/SKILL.md @@ -0,0 +1,414 @@ +--- +name: skill-authoring +description: "Guided workflow for creating new Databricks skills for ai-dev-kit. Use when a contributor wants to create a new skill, draft a SKILL.md, generate test cases, or improve an existing skill's structure. Triggers on 'create skill', 'new skill', 'author skill', 'draft skill', 'skill template', or 'write a skill for'." +--- + +# Databricks Skill Authoring Guide + +Conversational workflow for creating high-quality Databricks skills in ai-dev-kit. Adapted from [Anthropic's skill-creator](https://github.com/anthropics/skills/tree/main/skills/skill-creator) (Apache 2.0) for ai-dev-kit conventions. + +## References + +- [Skill Format Specification](references/skill-format.md) — Frontmatter rules, progressive disclosure, section conventions +- [Test Format Specification](references/test-format.md) — ground_truth.yaml and manifest.yaml schemas + +## Workflow Overview + +Follow these phases in order. Do not skip phases — each builds on the previous. + +``` +Phase 1: Capture Intent ──► Phase 2: Interview & Research ──► Phase 3: Write SKILL.md + │ │ + │ ▼ + │ Phase 4: Write Test Cases + │ │ + │ ▼ + │ Phase 5: Validate & Register + │ │ + │ ▼ + └──────────────────────────────────────────────────── Phase 6: Evaluate & Improve +``` + +--- + +## Phase 1: Capture Intent + +Ask the contributor these questions before writing anything: + +1. **What Databricks feature or domain does this skill cover?** + - e.g., "Unity Catalog metric views", "Lakebase autoscaling", "Model Serving endpoints" + +2. **Who is the target user?** + - Data engineers, data scientists, ML engineers, analysts, app developers? + +3. 
**What are 3-5 specific tasks a user would ask Claude to do with this skill?** + - These become the seed for test cases later. + +4. **Does this overlap with an existing skill?** + - Check `databricks-skills/` for related skills. If overlap exists, clarify the boundary. + +5. **What language(s) should code examples use?** + - Python, SQL, YAML (for bundles), or a mix? + +**Output**: A short summary paragraph capturing the skill's purpose, audience, and scope. Save this — it becomes the seed for the description. + +--- + +## Phase 2: Interview & Research + +Dig deeper into the domain. Ask the contributor (or research from Databricks docs): + +### Domain Questions +- What are the **critical rules** a user must always follow? (e.g., "always use serverless", "never use deprecated DLT syntax") +- What are the **common mistakes** or deprecated patterns to avoid? +- What **API versions or SDK methods** are relevant? Verify they exist in the current Databricks SDK. +- Are there **multiple approaches** to the same task? (e.g., Python SDK vs. SQL vs. CLI) — document the recommended one first. + +### Scope Questions +- What should this skill **NOT** cover? (Explicit exclusions prevent scope creep) +- Are there **prerequisite skills** the user should know about? (e.g., databricks-unity-catalog for permissions) +- Should the skill reference any **MCP tools** from `databricks-mcp-server/`? If so, verify the tool names exist as `@mcp.tool` functions. + +### Structure Questions +- Is the content **small enough for a single SKILL.md** (<500 lines)? +- If not, what content should go into **reference files**? (Deep API references, exhaustive parameter lists, migration guides) + +**Output**: A structured outline of sections and reference files. + +--- + +## Phase 3: Write SKILL.md + +Create the skill directory and SKILL.md following ai-dev-kit conventions. 
+ +### 3.1 Directory Structure + +``` +databricks-skills/{skill-name}/ +├── SKILL.md # Required — main skill file +├── {reference1}.md # Optional — deep reference content +├── {reference2}.md # Optional — additional patterns +└── ... +``` + +The directory name MUST match the `name` field in frontmatter. + +### 3.2 Frontmatter + +```yaml +--- +name: {skill-name} +description: "{One paragraph, max 1024 chars. Must include 'Use when' trigger phrases. Be specific and pushy — Claude tends to under-trigger skills, so make the description assertive about when to activate.}" +--- +``` + +**Frontmatter rules** (enforced by CI): +- `name`: Required. Lowercase letters, numbers, hyphens only. Max 64 chars. Must not contain "anthropic" or "claude". +- `description`: Required. Non-empty. Max 1024 chars. No XML tags. Must contain "Use when" with specific trigger scenarios. + +**Writing effective descriptions:** +- Lead with what the skill does: "Creates, configures, and manages X..." +- Include explicit trigger phrases: "Use when building X, working with Y, or when the user mentions Z." +- List specific keywords that should activate the skill +- Be assertive — "Use when" not "Can be used when" +- Example from a good skill: + ``` + "Patterns for Databricks Vector Search: create endpoints and indexes, query with filters, manage embeddings. Use when building RAG applications, semantic search, or similarity matching. Covers both storage-optimized and standard endpoints." + ``` + +### 3.3 Progressive Disclosure Architecture + +Structure content in three tiers: + +| Tier | What | When Loaded | Budget | +|------|------|-------------|--------| +| **Metadata** | name + description in frontmatter | Always in context | ~100 words | +| **Body** | SKILL.md content below frontmatter | When skill triggers | <500 lines ideal | +| **References** | Separate .md files | When Claude reads them | Unlimited | + +**Key principle**: Keep SKILL.md lean. 
Move deep reference material (exhaustive API docs, parameter lists, migration guides) into reference files. The body should contain enough to handle 80% of requests; reference files cover the remaining 20%. + +### 3.4 Recommended Body Sections + +Based on patterns from the best existing skills: + +```markdown +# {Skill Title} + +{One-paragraph summary + critical rules if any} + +## When to Use + +Use this skill when: +- {Specific scenario 1} +- {Specific scenario 2} +- {Specific scenario 3} + +## Overview + +{Component table or conceptual summary — help the user understand the landscape before diving into code} + +## Quick Start + +{The simplest, most common use case. Working code example the user can copy.} + +## Common Patterns + +### Pattern 1: {Descriptive Name} +{Code example + brief explanation} + +### Pattern 2: {Descriptive Name} +{Code example + brief explanation} + +## Reference Files + +- [{reference1}.md]({reference1}.md) - {What this covers} +- [{reference2}.md]({reference2}.md) - {What this covers} + +## Common Issues + +| Issue | Solution | +|-------|----------| +| **{Problem}** | {Fix} | +| **{Problem}** | {Fix} | +``` + +**Section guidelines:** +- **Critical Rules**: If the domain has rules that MUST always be followed (like "always use serverless"), put them right after the title, before any other section. Use bold and imperative language. +- **When to Use**: Reinforces the description triggers. Helps Claude decide if the skill applies. +- **Overview**: Tables work well for comparing options (endpoint types, index types, SDK methods). +- **Quick Start**: Must be a complete, working example — not pseudocode. This is the most important section. +- **Common Patterns**: 2-5 patterns covering the most frequent use cases. Each should have a real code example. +- **Reference Files**: Only include if SKILL.md would exceed ~400 lines without them. +- **Common Issues**: Known gotchas, error messages, and their fixes. 
+ +### 3.5 Reference File Conventions + +Reference files follow the same markdown format without frontmatter: + +```markdown +# {Pattern/Topic Name} + +## When to Use +{Specific scenario this reference covers} + +## Code Example +{Working code with explanations} + +## Explanation +{Why this approach, tradeoffs, alternatives} + +## Common Variations +{Modifications for different scenarios} +``` + +### 3.6 Code Example Standards + +- All code blocks MUST have a language tag (```python, ```sql, ```yaml, ```bash) +- Use realistic values, not "foo/bar" placeholders +- Include imports and setup — examples should be copy-pasteable +- Use current APIs only — no deprecated patterns (e.g., `@dp.table` not `@dlt.table`) +- Prefer the Databricks SDK (`from databricks.sdk import WorkspaceClient`) for Python examples + +--- + +## Phase 4: Write Test Cases + +Generate test scaffolding using the existing test framework. + +### 4.1 Initialize Test Scaffolding + +Run: +```bash +uv run python .test/scripts/init_skill.py {skill-name} +``` + +Or use the `/skill-test` command: +``` +/skill-test {skill-name} init +``` + +This creates the test directory at `.test/skills/{skill-name}/` with template files. + +### 4.2 Write Ground Truth + +Create `.test/skills/{skill-name}/ground_truth.yaml` with 3-5 test cases. Use the tasks from Phase 1 as seeds. + +See [Test Format Specification](references/test-format.md) for the full schema. 
+ +**Test case guidelines:** +- Include at least one "happy path" (straightforward use case) +- Include at least one "edge case" (unusual input, boundary condition) +- Include at least one that tests the skill's boundaries (what it should NOT do) +- Use realistic prompts — write them as a user would naturally phrase the request +- Expected responses should be complete and correct — they become the reference standard + +### 4.3 Write Routing Tests + +Add entries to `.test/skills/_routing/ground_truth.yaml`: + +```yaml +# Should trigger +- id: "routing_{skill-name}_001" + inputs: + prompt: "{A prompt that should trigger this skill}" + expectations: + expected_skills: ["{skill-name}"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "{Why this should trigger the skill}" + +# Should NOT trigger +- id: "routing_{skill-name}_neg_001" + inputs: + prompt: "{A prompt that sounds related but should NOT trigger}" + expectations: + expected_skills: [] + is_multi_skill: false + metadata: + category: "no_match" + difficulty: "medium" + reasoning: "{Why this should NOT trigger despite seeming related}" +``` + +Include at minimum: +- 3 should-trigger prompts (easy, medium, hard) +- 2 should-NOT-trigger prompts (plausible false positives) + +### 4.4 Configure Manifest + +Edit `.test/skills/{skill-name}/manifest.yaml`: + +```yaml +skill: + name: "{skill-name}" + source_path: "databricks-skills/{skill-name}" + description: "{Short description}" + +evaluation: + datasets: + - path: ground_truth.yaml + type: yaml + + scorers: + tier1: + - python_syntax # If skill produces Python + - sql_syntax # If skill produces SQL + - pattern_adherence + - no_hallucinated_apis + tier2: + - code_executes # If execution can be verified + tier3: + - Guidelines + + quality_gates: + tier1_pass_rate: 1.0 + tier2_pass_rate: 0.8 + tier3_pass_rate: 0.85 +``` + +Remove scorers that don't apply (e.g., drop `sql_syntax` if the skill only produces Python). 
+ +--- + +## Phase 5: Validate & Register + +### 5.1 Run CI Validation + +```bash +python .github/scripts/validate_skills.py +``` + +This checks: +- SKILL.md exists with valid frontmatter +- name/description meet constraints +- Skill is registered in install_skills.sh + +### 5.2 Register in Install Script + +Add the skill name to the appropriate variable in `databricks-skills/install_skills.sh`: + +1. Add to `DATABRICKS_SKILLS` (the main skill list) +2. Add to the appropriate profile in `install.sh` and `install.ps1` (data-engineer, analyst, ai-ml-engineer, or app-developer) +3. If the skill has extra files beyond SKILL.md, add entries to the `get_skill_extra_files()` function + +### 5.3 Update README + +Add the skill to the skills table in `databricks-skills/README.md`. + +--- + +## Phase 6: Evaluate & Improve + +Use the existing test framework to measure and improve quality. + +### 6.1 Run Evaluation + +```bash +# Quick evaluation against ground truth +uv run python .test/scripts/run_eval.py {skill-name} + +# Full MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py {skill-name} + +# Routing accuracy check +uv run python .test/scripts/routing_eval.py _routing +``` + +### 6.2 Save Baseline + +Once results are acceptable: +```bash +uv run python .test/scripts/baseline.py {skill-name} +``` + +### 6.3 Optimize (Advanced) + +For description optimization and skill improvement, use the GEPA framework: +```bash +uv run python .test/scripts/optimize.py {skill-name} --preset quick +``` + +Presets: `quick` (15 iterations), `standard` (50), `thorough` (150). + +### 6.4 Iteration Loop + +After evaluation, improve the skill by: +1. **Reviewing failures**: Read the evaluation output for failed test cases +2. **Identifying patterns**: Are failures concentrated in one area (e.g., SQL syntax, missing facts)? +3. **Updating SKILL.md**: Add missing patterns, fix incorrect examples, clarify ambiguous instructions +4. 
**Re-running evaluation**: Verify improvements and check for regressions + +Repeat until quality gates are met: +- Tier 1 (syntax/patterns): 100% pass rate +- Tier 2 (execution): 80% pass rate +- Tier 3 (LLM judge): 85% pass rate + +--- + +## Checklist + +Before submitting a PR, verify: + +- [ ] `SKILL.md` has valid frontmatter (name, description with "Use when" triggers) +- [ ] Description is assertive and includes specific trigger keywords +- [ ] Body is under 500 lines (reference files used for overflow) +- [ ] All code blocks have language tags +- [ ] Code examples use current APIs (no deprecated patterns) +- [ ] Quick Start example is complete and copy-pasteable +- [ ] Common Issues section covers known gotchas +- [ ] Test scaffolding exists at `.test/skills/{skill-name}/` +- [ ] At least 3 ground truth test cases written +- [ ] At least 5 routing test cases (3 positive, 2 negative) added +- [ ] `validate_skills.py` passes +- [ ] Skill registered in `install_skills.sh` +- [ ] Skill added to appropriate profile in `install.sh`/`install.ps1` +- [ ] Skills table updated in `databricks-skills/README.md` + +--- + +## Attribution + +This skill's authoring workflow is adapted from [Anthropic's skill-creator](https://github.com/anthropics/skills/tree/main/skills/skill-creator), licensed under Apache License 2.0. diff --git a/.skill-authoring/install_skill_authoring.sh b/.skill-authoring/install_skill_authoring.sh new file mode 100755 index 00000000..0b03e384 --- /dev/null +++ b/.skill-authoring/install_skill_authoring.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Install/sync skill-authoring skill to .claude/skills/ +# +# The source of truth is .skill-authoring/ (this directory). +# This script copies files to .claude/skills/skill-authoring/ so Claude Code +# picks them up automatically when contributors clone the repo. 
+# +# Usage: +# bash .skill-authoring/install_skill_authoring.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TARGET_DIR="$REPO_ROOT/.claude/skills/skill-authoring" + +mkdir -p "$TARGET_DIR/references" + +echo "Syncing skill-authoring to $TARGET_DIR ..." + +cp "$SCRIPT_DIR/SKILL.md" "$TARGET_DIR/SKILL.md" +cp "$SCRIPT_DIR/references/skill-format.md" "$TARGET_DIR/references/skill-format.md" +cp "$SCRIPT_DIR/references/test-format.md" "$TARGET_DIR/references/test-format.md" + +echo "Done. skill-authoring skill installed at $TARGET_DIR" diff --git a/.skill-authoring/references/skill-format.md b/.skill-authoring/references/skill-format.md new file mode 100644 index 00000000..69437b34 --- /dev/null +++ b/.skill-authoring/references/skill-format.md @@ -0,0 +1,156 @@ +# Skill Format Specification + +Complete reference for ai-dev-kit skill format, frontmatter rules, and structural conventions. + +## Directory Layout + +``` +databricks-skills/{skill-name}/ +├── SKILL.md # Required — main skill instructions +├── {reference}.md # Optional — deep reference content +└── ... 
# Additional reference files as needed +``` + +- Directory name must exactly match the `name` field in SKILL.md frontmatter +- Use lowercase letters, numbers, and hyphens only +- Prefix with `databricks-` for Databricks platform skills (convention, not enforced) + +## Frontmatter Specification + +SKILL.md must begin with YAML frontmatter delimited by `---`: + +```yaml +--- +name: "{skill-name}" +description: "{skill-description}" +--- +``` + +### `name` (required) + +| Constraint | Rule | +|------------|------| +| Format | Lowercase letters, numbers, hyphens only (`^[a-z0-9]+(-[a-z0-9]+)*$`) | +| Max length | 64 characters | +| Reserved words | Must not contain "anthropic" or "claude" | +| XML tags | Must not contain XML/HTML tags | +| Match | Must match the directory name | + +### `description` (required) + +| Constraint | Rule | +|------------|------| +| Min length | Non-empty | +| Max length | 1024 characters | +| XML tags | Must not contain XML/HTML tags | +| Trigger phrases | Should contain "Use when" with specific scenarios | + +**Writing effective descriptions:** + +The description is the primary mechanism Claude uses to decide whether to activate a skill. It is always loaded in context (~100 words). Make it count. + +1. **Lead with action**: "Creates, configures, and manages..." +2. **Include explicit triggers**: "Use when building X, working with Y, or when the user mentions Z." +3. **List keywords**: Include domain-specific terms that should activate the skill +4. **Be assertive**: "Use when" not "Can be used when" — Claude under-triggers by default + +**Good example:** +``` +"Creates, configures, and updates Databricks Lakeflow Spark Declarative Pipelines (SDP/LDP) +using serverless compute. Handles streaming tables, materialized views, CDC, SCD Type 2, and +Auto Loader ingestion patterns. 
Use when building data pipelines, working with Delta Live Tables, +ingesting streaming data, implementing change data capture, or when the user mentions SDP, LDP, +DLT, Lakeflow pipelines, streaming tables, or bronze/silver/gold medallion architectures." +``` + +**Bad example:** +``` +"A skill for Databricks pipelines." +``` + +## Progressive Disclosure Model + +Content is structured in three tiers to manage context window usage: + +| Tier | Location | When Loaded | Budget | +|------|----------|-------------|--------| +| **1. Metadata** | Frontmatter (name + description) | Always in context | ~100 words | +| **2. Body** | SKILL.md content below frontmatter | When skill triggers | <500 lines | +| **3. References** | Separate .md files in same directory | When Claude reads them on demand | Unlimited | + +### Tier 1: Metadata +- Always in Claude's context window +- Must be information-dense — every word earns its place +- The description is the skill's "elevator pitch" AND its routing signal + +### Tier 2: Body (SKILL.md) +- Loaded when the skill triggers based on description match +- Target: under 500 lines for the full body +- Should handle 80% of user requests without needing reference files +- Include working code examples, common patterns, and critical rules + +### Tier 3: References +- Loaded only when Claude specifically reads them (via `Read` tool) +- Use for: exhaustive API references, parameter lists, migration guides, advanced patterns +- No line limit, but keep individual files focused on one topic +- Reference them from SKILL.md: `See [API Reference](api-reference.md) for details.` + +## Body Section Conventions + +Based on analysis of the best existing skills (databricks-vector-search, databricks-spark-declarative-pipelines, databricks-python-sdk): + +### Recommended Sections (in order) + +1. **Title** (`# {Skill Name}`) — Brief summary paragraph +2. **Critical Rules** — Only if the domain has rules that MUST always be followed. 
Use bold, imperative language. Place before all other sections. +3. **When to Use** — Bulleted list of specific scenarios. Reinforces description triggers. +4. **Overview** — Conceptual summary. Tables work well for comparing options (types, methods, approaches). +5. **Quick Start** — The simplest, most common use case. Complete, working, copy-pasteable code. +6. **Common Patterns** — 2-5 patterns for frequent use cases. Each has a heading, code example, and brief explanation. +7. **Reference Files** — Links to reference .md files with descriptions of what each covers. +8. **Common Issues** — Table of known gotchas, error messages, and fixes. + +### Section Guidelines + +- **Critical Rules**: Not every skill needs this. Only include when there are hard requirements (e.g., "MUST use serverless", "NEVER use deprecated syntax"). Example from databricks-spark-declarative-pipelines: + ```markdown + ## Critical Rules (always follow) + - **MUST** confirm language as Python or SQL + - **MUST** create serverless pipelines by default + ``` + +- **Quick Start**: This is the most important section. It should be a single, complete example that works out of the box. Include imports, setup, and realistic values. 
+ +- **Common Patterns**: Each pattern should have: + - A descriptive heading (`### Pattern 2: Hybrid Search with Filters`) + - Working code (with language tag) + - 1-2 sentences explaining when/why to use this pattern + +- **Common Issues**: Use a table format for scannability: + ```markdown + | Issue | Solution | + |-------|----------| + | **`PERMISSION_DENIED` on endpoint** | Grant `USE ENDPOINT` via Unity Catalog | + ``` + +## Code Example Standards + +- All code blocks MUST have a language tag: ```python, ```sql, ```yaml, ```bash +- Use realistic values (catalog names, table names, paths) — not "foo", "bar", "example" +- Include imports and setup code — examples should be copy-pasteable +- Use current Databricks APIs only: + - `@dp.table` not `@dlt.table` + - `CLUSTER BY` not `PARTITION BY` + - `mlflow.genai.evaluate` not `mlflow.evaluate` + - `from databricks.sdk import WorkspaceClient` for SDK examples +- Show the most common/recommended approach first, alternatives second + +## Registration + +After creating a skill, it must be registered in three places: + +1. **`databricks-skills/install_skills.sh`**: Add to `DATABRICKS_SKILLS` variable +2. **`install.sh` / `install.ps1`**: Add to the appropriate profile (data-engineer, analyst, ai-ml-engineer, app-developer) +3. **`databricks-skills/README.md`**: Add to the skills table + +The CI validator (`validate_skills.py`) cross-references directories against `install_skills.sh` and will fail if a skill directory exists without registration. diff --git a/.skill-authoring/references/test-format.md b/.skill-authoring/references/test-format.md new file mode 100644 index 00000000..07c75f55 --- /dev/null +++ b/.skill-authoring/references/test-format.md @@ -0,0 +1,263 @@ +# Test Format Specification + +Reference for ai-dev-kit's skill testing formats: `ground_truth.yaml`, `manifest.yaml`, and routing tests. + +All test files live at the repository root under `.test/skills/`, not relative to the skill directory. 
+ +## File Locations + +``` +.test/skills/ +├── _routing/ +│ └── ground_truth.yaml # Routing tests (shared across all skills) +├── {skill-name}/ +│ ├── ground_truth.yaml # Skill-specific test cases +│ ├── candidates.yaml # Pending test cases for review +│ └── manifest.yaml # Evaluation configuration +``` + +## Manifest (`manifest.yaml`) + +Defines skill metadata, evaluation datasets, scorers, and quality gates. + +```yaml +skill: + name: "{skill-name}" # Must match directory name + source_path: "databricks-skills/{skill-name}" # Path to skill source + description: "{Short description of what's being tested}" + +evaluation: + datasets: + - path: ground_truth.yaml + type: yaml + + scorers: + tier1: # Deterministic (fast, run first) + - python_syntax # AST parsing of Python code blocks + - sql_syntax # Structural SQL validation + - pattern_adherence # Regex pattern matching + - no_hallucinated_apis # Check for deprecated/incorrect APIs + tier2: # Execution-based + - code_executes # Validates generated code runs + tier3: # LLM Judge (slowest, run last) + - Guidelines # Semantic evaluation against guidelines + + quality_gates: + tier1_pass_rate: 1.0 # 100% — syntax and patterns must always pass + tier2_pass_rate: 0.8 # 80% — some execution failures acceptable + tier3_pass_rate: 0.85 # 85% — LLM judge threshold +``` + +### Available Scorers + +**Tier 1 (Deterministic):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `python_syntax` | `outputs.response` | All Python code blocks parse via AST | +| `sql_syntax` | `outputs.response` | SQL statements are structurally valid | +| `pattern_adherence` | `outputs.response` + `expectations.expected_patterns` | Required regex patterns present | +| `no_hallucinated_apis` | `outputs.response` | No deprecated APIs (`@dlt.table`, `PARTITION BY`, etc.) 
| +| `expected_facts_present` | `outputs.response` + `expectations.expected_facts` | Required facts mentioned | + +**Tier 2 (Execution):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `code_executes` | `outputs.execution_success` | Generated code runs successfully | + +**Tier 3 (LLM Judge):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `Guidelines` | `expectations.guidelines` | Semantic adherence judged by LLM | + +**Choose scorers based on your skill:** +- Python-only skill: include `python_syntax`, drop `sql_syntax` +- SQL-only skill: include `sql_syntax`, drop `python_syntax` +- Both: include both +- Always include: `pattern_adherence`, `no_hallucinated_apis` +- Include `Guidelines` for nuanced quality checks + +## Ground Truth (`ground_truth.yaml`) + +Test cases with inputs, expected outputs, expectations, and metadata. + +```yaml +test_cases: + - id: "{skill-name}_{category}_{number}" # Unique identifier + + inputs: + prompt: | + {Natural language prompt as a user would write it. + Be specific and realistic — avoid generic "do X" prompts.} + + outputs: + response: | + {Complete expected response including code blocks, explanations, + and any other content the skill should produce.} + execution_success: true # Optional: did the code run? 
+ + expectations: + expected_facts: # Tier 1: must be mentioned + - "Uses STREAMING TABLE for incremental ingestion" + - "Includes CLUSTER BY instead of PARTITION BY" + + expected_patterns: # Tier 1: regex patterns + - pattern: "CREATE OR REFRESH STREAMING TABLE" + min_count: 1 + - pattern: "CLUSTER BY" + min_count: 1 + - pattern: "PARTITION BY" # Negative check + max_count: 0 + min_count: 0 + + guidelines: # Tier 3: LLM judge criteria + - "Must use modern SDP syntax, not legacy DLT" + - "Should include metadata columns for lineage" + + metadata: + category: "happy_path" # happy_path | edge_case | error_handling | boundary + difficulty: "easy" # easy | medium | hard + source: "manual" # manual | generated | trace + tags: ["bronze", "ingestion"] # Searchable tags +``` + +### Test Case Design Guidelines + +**Minimum: 3-5 test cases per skill:** + +1. **Happy path (easy)** — The most common, straightforward use case. If a user asks the simplest possible question about this skill, does it respond correctly? + +2. **Happy path (medium)** — A realistic use case with some complexity (multiple parameters, configuration choices). + +3. **Edge case** — Unusual input, boundary condition, or a request that tests the limits of the skill's knowledge. + +4. **Error/boundary case** — Something the skill should gracefully handle or refuse (e.g., a request for a deprecated pattern). + +5. **Multi-step (optional)** — A complex request requiring the skill to produce multiple code blocks or make architectural decisions. + +**Writing good prompts:** +- Write as a real user would — natural language, not formal +- Include context: "I have a Delta table at catalog.schema.table with columns x, y, z..." 
+- Be specific enough that there's a clear "correct" answer +- Vary complexity across test cases + +**Writing good expected responses:** +- Must be complete and correct — these are the gold standard +- Include all code blocks with language tags +- Use current APIs (not deprecated ones) +- Include brief explanations where a user would expect them + +### Pattern Specification + +Patterns support both simple strings and structured objects: + +```yaml +expected_patterns: + # Simple: just a regex string (min_count defaults to 1) + - "WorkspaceClient" + + # Structured: full control + - pattern: "CREATE OR REFRESH STREAMING TABLE" + min_count: 1 # Minimum matches required (default: 1) + max_count: 3 # Maximum matches allowed (optional) + description: "Uses streaming table syntax" + + # Negative check: pattern must NOT appear + - pattern: "@dlt\\.table" + min_count: 0 + max_count: 0 + description: "Must not use deprecated DLT syntax" +``` + +## Routing Tests (`_routing/ground_truth.yaml`) + +Shared across all skills. Tests whether prompts trigger the correct skill(s). 
+ +```yaml +test_cases: + # Should trigger (single skill) + - id: "routing_{skill-name}_001" + inputs: + prompt: "{Prompt that should clearly trigger this skill}" + expectations: + expected_skills: ["{skill-name}"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "{Why this should trigger — which keyword/phrase matches}" + + # Should trigger (multi-skill) + - id: "routing_{skill-name}_multi_001" + inputs: + prompt: "{Prompt that should trigger this skill AND another}" + expectations: + expected_skills: ["{skill-name}", "{other-skill}"] + is_multi_skill: true + metadata: + category: "multi_skill" + difficulty: "medium" + reasoning: "{Why both skills should activate}" + + # Should NOT trigger (false positive guard) + - id: "routing_{skill-name}_neg_001" + inputs: + prompt: "{Prompt that sounds related but should NOT trigger}" + expectations: + expected_skills: [] + is_multi_skill: false + metadata: + category: "no_match" + difficulty: "medium" + reasoning: "{Why this should NOT trigger despite seeming related}" +``` + +### Routing Test Guidelines + +**Minimum per skill: 5 routing test cases:** +- 3 should-trigger (easy, medium, hard difficulty) +- 2 should-NOT-trigger (plausible false positives) + +**Difficulty levels:** +- **Easy**: Contains explicit keywords from the skill description (e.g., "streaming table" for SDP) +- **Medium**: Uses domain language without exact keywords (e.g., "incremental ingestion pipeline") +- **Hard**: Ambiguous prompt that could match multiple skills but should match this one + +**False positive guards:** +- Prompts that use related terminology but belong to a different skill +- Generic prompts that shouldn't trigger any skill (e.g., "What's the weather?") + +## Initializing Test Scaffolding + +Use the test framework to create template files: + +```bash +# Via script +uv run python .test/scripts/init_skill.py {skill-name} + +# Via slash command +/skill-test {skill-name} init +``` + +This 
creates: +- `.test/skills/{skill-name}/manifest.yaml` — with default scorers +- `.test/skills/{skill-name}/ground_truth.yaml` — empty template +- `.test/skills/{skill-name}/candidates.yaml` — empty template + +## Running Tests + +```bash +# Quick evaluation +uv run python .test/scripts/run_eval.py {skill-name} + +# MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py {skill-name} + +# Routing accuracy +uv run python .test/scripts/routing_eval.py _routing + +# Compare against baseline +uv run python .test/scripts/regression.py {skill-name} + +# Save current results as baseline +uv run python .test/scripts/baseline.py {skill-name} +``` diff --git a/.test/SKILL.md b/.test/SKILL.md index 9ba42e10..a9790856 100644 --- a/.test/SKILL.md +++ b/.test/SKILL.md @@ -44,6 +44,7 @@ The `/skill-test` command provides an interactive CLI for testing Databricks ski | `list-traces` | List available traces (MLflow or local) | | `scorers` | List configured scorers for a skill | | `scorers update` | Add/remove scorers or update default guidelines | +| `quick-trigger` | Quick smoke test: does the skill trigger on expected prompts? | | `sync` | Sync YAML to Unity Catalog (Phase 2) | ### Quick Examples @@ -92,6 +93,7 @@ uv run python .test/scripts/{subcommand}.py {skill_name} [options] | `sync` | `sync.py` | | `trace-eval` | `trace_eval.py` | | `list-traces` | `list_traces.py` | +| `quick-trigger` | `quick_trigger.py` | | `_routing mlflow` | `routing_eval.py` | Use `--help` on any script for available options. diff --git a/.test/scripts/quick_trigger.py b/.test/scripts/quick_trigger.py new file mode 100644 index 00000000..7dfbf950 --- /dev/null +++ b/.test/scripts/quick_trigger.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""Quick trigger validation for skills. + +Tests whether a skill triggers correctly by running prompts through +`claude -p` and checking if the skill name appears in the output. 
+This is a lightweight smoke test for development — use routing_eval.py
+for comprehensive routing evaluation.
+
+Inspired by Anthropic's skill-creator run_eval.py (Apache 2.0).
+
+Usage:
+    uv run python .test/scripts/quick_trigger.py <skill-name> [options]
+
+Options:
+    --prompts    Comma-separated list of test prompts (overrides auto-discovery)
+    --negative   Comma-separated list of prompts that should NOT trigger the skill
+    --verbose    Show full claude output
+    --timeout    Timeout per prompt in seconds (default: 30)
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+from _common import find_repo_root, setup_path
+
+repo_root = setup_path()
+
+SKILLS_DIR = repo_root / "databricks-skills"
+ROUTING_TESTS = repo_root / ".test" / "skills" / "_routing" / "ground_truth.yaml"
+
+
+def load_skill_description(skill_name: str) -> str | None:
+    """Load description from a skill's SKILL.md frontmatter."""
+    skill_md = SKILLS_DIR / skill_name / "SKILL.md"
+    if not skill_md.exists():
+        return None
+    content = skill_md.read_text()
+    match = re.match(r"^---\n(.+?)\n---", content, re.DOTALL)
+    if match:
+        import yaml
+
+        fm = yaml.safe_load(match.group(1))
+        return fm.get("description", "")
+    return None
+
+
+def load_routing_tests(skill_name: str) -> tuple[list[str], list[str]]:
+    """Load routing test prompts for this skill from ground_truth.yaml.
+
+    Returns (positive_prompts, negative_prompts).
+ """ + positive = [] + negative = [] + + if not ROUTING_TESTS.exists(): + return positive, negative + + import yaml + + data = yaml.safe_load(ROUTING_TESTS.read_text()) + for tc in data.get("test_cases", []): + expected = tc.get("expectations", {}).get("expected_skills", []) + prompt = tc.get("inputs", {}).get("prompt", "") + if not prompt: + continue + if skill_name in expected: + positive.append(prompt) + elif not expected and skill_name in tc.get("id", ""): + negative.append(prompt) + + return positive, negative + + +def run_prompt(prompt: str, timeout: int = 30) -> tuple[str, float]: + """Run a prompt through claude -p and return (output, elapsed_seconds).""" + start = time.time() + try: + result = subprocess.run( + ["claude", "-p", prompt, "--output-format", "text"], + capture_output=True, + text=True, + timeout=timeout, + ) + elapsed = time.time() - start + return result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - start + return f"TIMEOUT after {timeout}s", elapsed + except FileNotFoundError: + return "ERROR: claude CLI not found. 
Install Claude Code first.", 0.0 + + +def check_skill_triggered(output: str, skill_name: str) -> bool: + """Check if the skill name appears in the output, suggesting it triggered.""" + # Look for skill name in output (case-insensitive, hyphen/underscore flexible) + pattern = skill_name.replace("-", "[-_\\s]?") + return bool(re.search(pattern, output, re.IGNORECASE)) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Quick trigger validation for skills") + parser.add_argument("skill_name", help="Name of the skill to test") + parser.add_argument("--prompts", help="Comma-separated positive test prompts") + parser.add_argument("--negative", help="Comma-separated negative test prompts") + parser.add_argument("--verbose", action="store_true", help="Show full output") + parser.add_argument( + "--timeout", type=int, default=30, help="Timeout per prompt (seconds)" + ) + args = parser.parse_args() + + skill_name = args.skill_name + + # Validate skill exists + desc = load_skill_description(skill_name) + if desc is None: + print(f"ERROR: Skill '{skill_name}' not found in {SKILLS_DIR}") + return 1 + + print(f"Quick Trigger Test: {skill_name}") + print(f"Description: {desc[:100]}...") + print() + + # Gather prompts + if args.prompts: + positive_prompts = [p.strip() for p in args.prompts.split(",")] + else: + positive_prompts, _ = load_routing_tests(skill_name) + + if args.negative: + negative_prompts = [p.strip() for p in args.negative.split(",")] + else: + _, negative_prompts = load_routing_tests(skill_name) + + if not positive_prompts: + print( + "WARNING: No test prompts found. 
Add routing tests to " + ".test/skills/_routing/ground_truth.yaml or use --prompts" + ) + print( + "Example: uv run python .test/scripts/quick_trigger.py " + f'{skill_name} --prompts "Create a vector search index"' + ) + return 1 + + # Run positive tests (should trigger) + results = {"positive": [], "negative": [], "summary": {}} + pass_count = 0 + total = len(positive_prompts) + + print(f"=== Should Trigger ({total} prompts) ===\n") + for i, prompt in enumerate(positive_prompts, 1): + output, elapsed = run_prompt(prompt, args.timeout) + triggered = check_skill_triggered(output, skill_name) + status = "PASS" if triggered else "FAIL" + if triggered: + pass_count += 1 + + print(f" [{status}] ({elapsed:.1f}s) {prompt[:80]}") + if args.verbose: + print(f" Output: {output[:200]}") + + results["positive"].append( + { + "prompt": prompt, + "triggered": triggered, + "elapsed_seconds": round(elapsed, 2), + } + ) + + # Run negative tests (should NOT trigger) + neg_pass_count = 0 + neg_total = len(negative_prompts) + + if negative_prompts: + print(f"\n=== Should NOT Trigger ({neg_total} prompts) ===\n") + for prompt in negative_prompts: + output, elapsed = run_prompt(prompt, args.timeout) + triggered = check_skill_triggered(output, skill_name) + status = "PASS" if not triggered else "FAIL" + if not triggered: + neg_pass_count += 1 + + print(f" [{status}] ({elapsed:.1f}s) {prompt[:80]}") + if args.verbose: + print(f" Output: {output[:200]}") + + results["negative"].append( + { + "prompt": prompt, + "triggered": triggered, + "elapsed_seconds": round(elapsed, 2), + } + ) + + # Summary + print(f"\n=== Summary ===") + print(f" Positive (should trigger): {pass_count}/{total}") + if negative_prompts: + print(f" Negative (should NOT trigger): {neg_pass_count}/{neg_total}") + + all_passed = pass_count == total and neg_pass_count == neg_total + results["summary"] = { + "skill_name": skill_name, + "positive_pass_rate": pass_count / total if total > 0 else 0, + "negative_pass_rate": 
neg_pass_count / neg_total if neg_total > 0 else 0, + "all_passed": all_passed, + } + + # Write results to JSON + output_path = ( + repo_root / ".test" / "skills" / skill_name / "quick_trigger_results.json" + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(results, indent=2)) + print(f"\n Results saved to: {output_path}") + + return 0 if all_passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d5b120fd..a38005c1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,15 +90,95 @@ Ensure your changes work with a live Databricks workspace. ## Adding New Skills -When adding a new skill to `databricks-skills/`: - -1. Create a directory with a descriptive name -2. Include a `SKILL.md` file that defines: - - Trigger conditions (when the skill activates) - - Core patterns and best practices - - Code examples -3. Add supporting documentation files as needed -4. Update the skills table in the main README.md +### Recommended: Use the Authoring Skill + +The fastest way to create a high-quality skill is with the `skill-authoring` skill (available in `.skill-authoring/` when you clone the repo). Ask Claude: + +> "Help me create a new skill for [Databricks feature]" + +This walks you through a structured workflow: interview, draft, test, validate, register. + +### Manual Workflow + +If you prefer to work manually: + +1. **Copy the template:** + ```bash + cp -r databricks-skills/TEMPLATE databricks-skills/your-skill-name + ``` + +2. **Write SKILL.md** with valid frontmatter and content: + ```yaml + --- + name: your-skill-name + description: "What it does. Use when [scenario1], [scenario2], or when the user mentions [keywords]." + --- + ``` + +3. **Generate test scaffolding:** + ```bash + /skill-test your-skill-name init + ``` + +4. **Write test cases** in `.test/skills/your-skill-name/ground_truth.yaml` (minimum 3 cases) + +5. 
**Add routing tests** to `.test/skills/_routing/ground_truth.yaml` (minimum 3 positive, 2 negative)
+
+6. **Register the skill:**
+   - Add to `DATABRICKS_SKILLS` in `databricks-skills/install_skills.sh`
+   - Add to appropriate profile in `install.sh` and `install.ps1`
+   - Add to skills table in `databricks-skills/README.md`
+
+7. **Validate:**
+   ```bash
+   python .github/scripts/validate_skills.py
+   ```
+
+### Quality Checklist
+
+Before submitting a PR for a new skill, verify:
+
+- [ ] **Frontmatter**: `name` (kebab-case, <=64 chars) and `description` (<=1024 chars, includes "Use when" triggers)
+- [ ] **Description is assertive**: Uses "Use when" with specific triggers and domain keywords
+- [ ] **Body under 500 lines**: Reference files used for overflow content
+- [ ] **Code blocks have language tags**: `` ```python ``, `` ```sql ``, `` ```yaml ``, `` ```bash ``
+- [ ] **Code examples are current**: No deprecated APIs (`@dlt.table`, `PARTITION BY`, `mlflow.evaluate`)
+- [ ] **Quick Start is complete**: Copy-pasteable with imports and realistic values
+- [ ] **Common Issues documented**: Known gotchas with solutions
+- [ ] **Test scaffolding exists**: `.test/skills/your-skill-name/` with ground_truth.yaml and manifest.yaml
+- [ ] **Routing tests added**: At least 5 entries (3 positive, 2 negative) in `_routing/ground_truth.yaml`
+- [ ] **CI validation passes**: `python .github/scripts/validate_skills.py`
+- [ ] **Registered in install scripts**: `install_skills.sh`, `install.sh`, `install.ps1`
+- [ ] **README updated**: Skill added to `databricks-skills/README.md`
+
+### Skill Format Reference
+
+For detailed format specifications, see:
+- `.skill-authoring/references/skill-format.md` — Frontmatter rules, progressive disclosure, section conventions
+- `.skill-authoring/references/test-format.md` — ground_truth.yaml and manifest.yaml schemas
+
+### Evaluation & Optimization
+
+After creating a skill, measure and improve quality:
+
+```bash
+# Quick trigger smoke test
+uv run python 
.test/scripts/quick_trigger.py your-skill-name + +# Evaluation against ground truth +uv run python .test/scripts/run_eval.py your-skill-name + +# Full MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py your-skill-name + +# Description optimization (GEPA framework) +uv run python .test/scripts/optimize.py your-skill-name --preset quick +``` + +Quality gates to meet: +- **Tier 1** (syntax/patterns): 100% pass rate +- **Tier 2** (execution): 80% pass rate +- **Tier 3** (LLM judge): 85% pass rate ## Updating Existing Skills diff --git a/databricks-skills/README.md b/databricks-skills/README.md index ae4886fc..e889e92a 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -110,6 +110,37 @@ description: "What this teaches" ... ``` +## Developing Skills + +For contributors creating new skills, ai-dev-kit provides a guided authoring workflow and evaluation framework. + +### Quick Start (Contributors) + +1. **Use the authoring skill** — Clone the repo and ask Claude: "Help me create a new skill for [feature]". The `skill-authoring` skill (in `.skill-authoring/`) will guide you through the full workflow: interview, draft, test, validate, register. + +2. **Or start manually** — Copy the template and fill in the sections: + ```bash + cp -r databricks-skills/TEMPLATE databricks-skills/your-skill-name + # Edit SKILL.md with your content + ``` + +3. **Generate test scaffolding**: + ```bash + /skill-test your-skill-name init + ``` + +4. **Run quick trigger validation**: + ```bash + uv run python .test/scripts/quick_trigger.py your-skill-name + ``` + +5. **Run full evaluation**: + ```bash + uv run python .test/scripts/run_eval.py your-skill-name + ``` + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for the full contributor guide and quality checklist. 
+ ## Troubleshooting **Skills not loading?** Check `.claude/skills/` exists and each skill has `SKILL.md` diff --git a/databricks-skills/TEMPLATE/SKILL.md b/databricks-skills/TEMPLATE/SKILL.md index c3764c43..8e3cbb39 100644 --- a/databricks-skills/TEMPLATE/SKILL.md +++ b/databricks-skills/TEMPLATE/SKILL.md @@ -1,24 +1,43 @@ --- -name: template -description: "A brief one-sentence description of what this skill helps with." +name: your-skill-name +description: "Brief description of what this skill does. Use when [specific scenario 1], [specific scenario 2], or when the user mentions [keyword1], [keyword2], [keyword3]." --- -# Skill Name +# Skill Title + +One-paragraph summary of what this skill covers and why it exists. + +## When to Use + +Use this skill when: +- Building or configuring [specific feature] +- Working with [specific API or tool] +- The user mentions [domain keywords] ## Overview -A short paragraph explaining what this skill does and when to use it. +Brief conceptual summary. Tables work well for comparing options: + +| Component | Description | When to Use | +|-----------|-------------|-------------| +| **Option A** | What it does | Best for X | +| **Option B** | What it does | Best for Y | ## Quick Start -Simple example showing the most common use case: +The simplest, most common use case. 
Must be complete and copy-pasteable:
 
 ```python
-# Example code or command
-example_function(
-    parameter1="value1",
-    parameter2="value2"
+from databricks.sdk import WorkspaceClient
+
+w = WorkspaceClient()
+
+# Example: the most common operation for this skill
+result = w.some_api.create(
+    name="my-resource",
+    config={"key": "value"}
 )
+print(f"Created: {result.name}")
 ```
 
 ## Common Patterns
@@ -26,29 +45,35 @@ example_function(
 ### Pattern 1: Basic Usage
 
 ```python
-# Simple example
-basic_example()
+# Description of what this pattern does
+w.some_api.basic_operation(
+    param1="value1",
+    param2="value2"
+)
 ```
 
-### Pattern 2: Advanced Usage
+### Pattern 2: Advanced Configuration
 
 ```python
-# More complex example
-advanced_example(
-    option1=True,
-    option2="custom"
+# When you need more control over behavior
+w.some_api.advanced_operation(
+    param1="value1",
+    advanced_config={
+        "setting1": True,
+        "setting2": "custom"
+    }
 )
 ```
 
 ## Reference Files
 
-Link to supporting documentation files if needed:
-- [example_file1.md](example_file1.md) - Description of what this covers
-- [example_file2.md](example_file2.md) - Description of what this covers
+Link to supporting documentation if SKILL.md would exceed ~400 lines:
+- [example_file1.md](example_file1.md) - Exhaustive API parameters and options
+- [example_file2.md](example_file2.md) - Migrating from deprecated patterns
 
 ## Common Issues
 
 | Issue | Solution |
 |-------|----------|
-| **Problem description** | How to fix it |
-| **Another problem** | Another solution |
+| **`PERMISSION_DENIED` error** | Grant required permissions via Unity Catalog |
+| **Resource not found** | Verify the resource exists in the correct catalog/schema |
diff --git a/databricks-skills/TEMPLATE/example_file1.md b/databricks-skills/TEMPLATE/example_file1.md
index 2efc64dc..b14debbd 100644
--- a/databricks-skills/TEMPLATE/example_file1.md
+++ b/databricks-skills/TEMPLATE/example_file1.md
@@ -1,41 +1,65 @@
-# Example 
Pattern 1 +# Detailed API Reference + +Reference file for deep content that doesn't fit in SKILL.md (<500 lines target). ## When to Use -Use this pattern when you need to accomplish X. +Use this reference when you need exhaustive API details, full parameter lists, or advanced configuration options that go beyond the common patterns in SKILL.md. + +## API Methods -## Code Example +### method_name() + +Creates or configures a resource. ```python -def simple_example(): - """ - A simple example showing the basic pattern. - """ - result = do_something( - input="value", - options={"setting": True} - ) - return result +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +result = w.some_api.method_name( + name="my-resource", # Required: resource name + description="What it does", # Optional: human-readable description + config={ + "setting1": True, # Default: False + "setting2": "value", # Options: "value", "other" + "advanced_option": 42 # Only needed for specific use cases + } +) ``` -## Explanation +**Parameters:** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `name` | str | Yes | — | Resource name (lowercase, hyphens allowed) | +| `description` | str | No | `""` | Human-readable description | +| `config` | dict | No | `{}` | Configuration options | -- **Step 1**: First do this -- **Step 2**: Then do this -- **Step 3**: Finally do this +**Returns:** `ResourceInfo` object with `.name`, `.id`, `.status` attributes. 
## Common Variations -### Variation A +### With Custom Authentication ```python -# Alternative approach -alternative_method() +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient(profile="my-profile") +result = w.some_api.method_name(name="my-resource") ``` -### Variation B +### With Error Handling ```python -# Another way to do it -another_method() +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound, PermissionDenied + +w = WorkspaceClient() +try: + result = w.some_api.method_name(name="my-resource") +except NotFound: + print("Resource not found — check catalog and schema") +except PermissionDenied: + print("Insufficient permissions — grant via Unity Catalog") ``` diff --git a/databricks-skills/TEMPLATE/example_file2.md b/databricks-skills/TEMPLATE/example_file2.md index 18172ecf..6e91ebc8 100644 --- a/databricks-skills/TEMPLATE/example_file2.md +++ b/databricks-skills/TEMPLATE/example_file2.md @@ -1,40 +1,54 @@ -# Example Pattern 2 +# Migration Guide + +Reference file for migration patterns from deprecated APIs or older approaches. ## When to Use -Use this pattern when you need to accomplish Y. +Use this reference when migrating from legacy patterns or when Claude detects deprecated API usage in existing code. + +## Deprecated Patterns + +### Old Pattern → New Pattern -## Code Example +**Before (deprecated):** +```python +# Do NOT use — this API was deprecated in version X.Y +old_api.legacy_method(param="value") +``` +**After (current):** ```python -def another_example(): - """ - Another example showing a different pattern. 
- """ - # Configure settings - config = { - "option1": "value1", - "option2": "value2" - } - - # Execute - result = execute_task(config) - - # Handle result - if result.success: - print("Success!") - else: - print(f"Error: {result.error}") +# Use this instead +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +w.new_api.current_method(param="value") ``` -## Best Practices +**Why it changed:** Brief explanation of why the API was updated and what benefits the new approach provides. + +### Another Migration -- Keep it simple -- Use clear variable names -- Handle errors appropriately +**Before:** +```sql +-- Deprecated: PARTITION BY is no longer recommended +CREATE TABLE my_table +PARTITION BY (date_col) +AS SELECT * FROM source; +``` + +**After:** +```sql +-- Use CLUSTER BY for better performance with liquid clustering +CREATE TABLE my_table +CLUSTER BY (date_col) +AS SELECT * FROM source; +``` -## Tips +## Migration Checklist -- Tip 1: A helpful hint -- Tip 2: Another useful suggestion -- Tip 3: One more thing to remember +| Old Pattern | New Pattern | Notes | +|-------------|-------------|-------| +| `old_function()` | `new_function()` | Direct replacement | +| `@dlt.table` | `@dp.table` | Requires import change | +| `PARTITION BY` | `CLUSTER BY` | Liquid clustering, better performance | diff --git a/databricks-skills/databricks-ai-functions/SKILL.md b/databricks-skills/databricks-ai-functions/SKILL.md index e3fc3fbb..b57f0767 100644 --- a/databricks-skills/databricks-ai-functions/SKILL.md +++ b/databricks-skills/databricks-ai-functions/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-ai-functions -description: "Use Databricks built-in AI Functions (ai_classify, ai_extract, ai_summarize, ai_mask, ai_translate, ai_fix_grammar, ai_gen, ai_analyze_sentiment, ai_similarity, ai_parse_document, ai_query, ai_forecast) to add AI capabilities directly to SQL and PySpark pipelines without managing model endpoints. 
Also covers document parsing and building custom RAG pipelines (parse → chunk → index → query)." +description: "Use Databricks built-in AI Functions (ai_classify, ai_extract, ai_summarize, ai_mask, ai_translate, ai_fix_grammar, ai_gen, ai_analyze_sentiment, ai_similarity, ai_parse_document, ai_query, ai_forecast) to add AI capabilities directly to SQL and PySpark pipelines without managing model endpoints. Also covers document parsing and building custom RAG pipelines (parse → chunk → index → query). Use when classifying text, extracting entities, summarizing content, redacting PII, translating languages, scoring sentiment, comparing text similarity, parsing documents, forecasting time series, or adding any AI capability to a SQL or PySpark pipeline." --- # Databricks AI Functions diff --git a/databricks-skills/databricks-dbsql/SKILL.md b/databricks-skills/databricks-dbsql/SKILL.md index 24bf2694..6e0e2f1c 100644 --- a/databricks-skills/databricks-dbsql/SKILL.md +++ b/databricks-skills/databricks-dbsql/SKILL.md @@ -2,6 +2,7 @@ name: databricks-dbsql description: >- Databricks SQL (DBSQL) advanced features and SQL warehouse capabilities. + Use when writing SQL queries on Databricks, configuring SQL warehouses, or working with advanced SQL features. This skill MUST be invoked when the user mentions: "DBSQL", "Databricks SQL", "SQL warehouse", "SQL scripting", "stored procedure", "CALL procedure", "materialized view", "CREATE MATERIALIZED VIEW", "pipe syntax", "|>",