From cd514f54a5275fcfe6484e9ce44abbc29bcfbfe0 Mon Sep 17 00:00:00 2001 From: praneeth_paikray-data Date: Tue, 24 Mar 2026 20:31:39 +0530 Subject: [PATCH] Add skill authoring workflow and CI quality checks (#338, #330) Evaluated Anthropic's skill-creator and cherry-picked its conversational authoring workflow for ai-dev-kit contributors. The existing .test/ framework already outperforms skill-creator on evaluation and optimization, so we focused on the one gap: a guided skill creation experience. What's included: - Contributor authoring skill (.skill-authoring/) with 6-phase workflow adapted from Anthropic's skill-creator (Apache 2.0, attributed) - Reference docs for skill format and test format specifications - Quick trigger validation script (.test/scripts/quick_trigger.py) - CI quality warnings in validate_skills.py (non-blocking): missing "Use when" triggers, body over 500 lines, untagged code blocks, broken reference file links - Upgraded TEMPLATE from 55-line placeholder to realistic template - Expanded CONTRIBUTING.md with quality checklist and full workflow - Added "Developing Skills" section to databricks-skills/README.md - Fixed "Use when" triggers for databricks-ai-functions and databricks-dbsql Co-authored-by: Isaac --- .github/scripts/validate_skills.py | 76 +++- .skill-authoring/SKILL.md | 414 ++++++++++++++++++ .skill-authoring/install_skill_authoring.sh | 25 ++ .skill-authoring/references/skill-format.md | 156 +++++++ .skill-authoring/references/test-format.md | 263 +++++++++++ .test/SKILL.md | 2 + .test/scripts/quick_trigger.py | 226 ++++++++++ CONTRIBUTING.md | 98 ++++- databricks-skills/README.md | 31 ++ databricks-skills/TEMPLATE/SKILL.md | 67 ++- databricks-skills/TEMPLATE/example_file1.md | 68 ++- databricks-skills/TEMPLATE/example_file2.md | 72 +-- .../databricks-ai-functions/SKILL.md | 2 +- databricks-skills/databricks-dbsql/SKILL.md | 1 + 14 files changed, 1418 insertions(+), 83 deletions(-) create mode 100644 .skill-authoring/SKILL.md 
create mode 100755 .skill-authoring/install_skill_authoring.sh create mode 100644 .skill-authoring/references/skill-format.md create mode 100644 .skill-authoring/references/test-format.md create mode 100644 .test/scripts/quick_trigger.py diff --git a/.github/scripts/validate_skills.py b/.github/scripts/validate_skills.py index e3bc38e4..29af99da 100644 --- a/.github/scripts/validate_skills.py +++ b/.github/scripts/validate_skills.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Validate skill structure and frontmatter. -Checks: +Checks (errors — block PRs): 1. Every skill directory has a SKILL.md file 2. SKILL.md has valid YAML frontmatter per best practices: - name: required, ≤64 chars, lowercase letters/numbers/hyphens only, @@ -9,6 +9,12 @@ - description: required, non-empty, ≤1024 chars, no XML tags 3. Local skill directories are registered in install_skills.sh (skill-list variables are auto-discovered, not hardcoded) + +Quality warnings (non-blocking): +4. Description should contain "Use when" trigger phrases +5. SKILL.md body should be under 500 lines (use reference files for overflow) +6. Code blocks should have language tags +7. 
Referenced files (markdown links) should exist """ import re @@ -26,6 +32,62 @@ XML_TAG_RE = re.compile(r"<[^>]+>") +CODE_BLOCK_RE = re.compile(r"^```(\w*)$", re.MULTILINE) +MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") +MAX_BODY_LINES = 500 + + +def quality_warnings(skill_dir: Path, content: str, frontmatter: dict) -> list[str]: + """Run non-blocking quality checks and return warnings.""" + warnings = [] + + # Check description contains "Use when" trigger phrases + desc = str(frontmatter.get("description", "")) + if desc and "use when" not in desc.lower(): + warnings.append( + f'{skill_dir.name}: description lacks "Use when" trigger phrases ' + f"(helps Claude decide when to activate the skill)" + ) + + # Check body length (lines below frontmatter) + body = re.sub(r"^---\n.+?\n---\n?", "", content, count=1, flags=re.DOTALL) + body_lines = len(body.strip().splitlines()) + if body_lines > MAX_BODY_LINES: + warnings.append( + f"{skill_dir.name}: SKILL.md body is {body_lines} lines " + f"(>{MAX_BODY_LINES}). Consider moving content to reference files." + ) + + # Check code blocks have language tags + # Every pair of ``` markers forms a block; even-indexed matches (0,2,4..) + # are opening markers, odd-indexed are closing markers. 
+ all_fences = list(CODE_BLOCK_RE.finditer(content)) + opening_fences = [all_fences[i] for i in range(0, len(all_fences), 2)] + untagged = sum(1 for m in opening_fences if not m.group(1)) + if untagged > 0: + warnings.append( + f"{skill_dir.name}: {untagged} code block(s) missing language tags " + f"(use ```python, ```sql, ```yaml, etc.)" + ) + + # Check referenced markdown files exist + for match in MD_LINK_RE.finditer(content): + link_target = match.group(2) + # Only check relative .md links (not URLs, not anchors) + if ( + not link_target.startswith("http") + and not link_target.startswith("#") + and link_target.endswith(".md") + ): + ref_path = skill_dir / link_target + if not ref_path.exists(): + warnings.append( + f"{skill_dir.name}: referenced file '{link_target}' not found" + ) + + return warnings + + def parse_frontmatter(content: str) -> dict | None: """Extract YAML frontmatter from markdown content.""" match = re.match(r"^---\n(.+?)\n---", content, re.DOTALL) @@ -117,6 +179,7 @@ def get_local_skill_dirs() -> set[str]: def main() -> int: errors: list[str] = [] + warnings: list[str] = [] actual_skills = get_local_skill_dirs() # --- Validate each skill directory's SKILL.md and frontmatter --- @@ -153,6 +216,10 @@ def main() -> int: for err in validate_description(str(frontmatter["description"])): errors.append(f"{skill_dir.name}: {err}") + # Quality warnings (non-blocking) + for warn in quality_warnings(skill_dir, content, frontmatter): + warnings.append(warn) + # --- Cross-reference with install_skills.sh --- install_content = INSTALL_SCRIPT.read_text() skill_vars, composite_vars = parse_skill_variables(install_content) @@ -182,6 +249,13 @@ def main() -> int: errors.append(f"Skills in {var_name} but no directory found: {sorted(missing)}") # --- Report --- + # Surface warnings (non-blocking) before errors + if warnings: + print(f"Quality warnings ({len(warnings)}):\n") + for warning in warnings: + print(f"::warning::{warning}") + print() + if errors: 
print("Skill validation failed:\n") for error in errors: diff --git a/.skill-authoring/SKILL.md b/.skill-authoring/SKILL.md new file mode 100644 index 00000000..be298b25 --- /dev/null +++ b/.skill-authoring/SKILL.md @@ -0,0 +1,414 @@ +--- +name: skill-authoring +description: "Guided workflow for creating new Databricks skills for ai-dev-kit. Use when a contributor wants to create a new skill, draft a SKILL.md, generate test cases, or improve an existing skill's structure. Triggers on 'create skill', 'new skill', 'author skill', 'draft skill', 'skill template', or 'write a skill for'." +--- + +# Databricks Skill Authoring Guide + +Conversational workflow for creating high-quality Databricks skills in ai-dev-kit. Adapted from [Anthropic's skill-creator](https://github.com/anthropics/skills/tree/main/skills/skill-creator) (Apache 2.0) for ai-dev-kit conventions. + +## References + +- [Skill Format Specification](references/skill-format.md) — Frontmatter rules, progressive disclosure, section conventions +- [Test Format Specification](references/test-format.md) — ground_truth.yaml and manifest.yaml schemas + +## Workflow Overview + +Follow these phases in order. Do not skip phases — each builds on the previous. + +``` +Phase 1: Capture Intent ──► Phase 2: Interview & Research ──► Phase 3: Write SKILL.md + │ │ + │ ▼ + │ Phase 4: Write Test Cases + │ │ + │ ▼ + │ Phase 5: Validate & Register + │ │ + │ ▼ + └──────────────────────────────────────────────────── Phase 6: Evaluate & Improve +``` + +--- + +## Phase 1: Capture Intent + +Ask the contributor these questions before writing anything: + +1. **What Databricks feature or domain does this skill cover?** + - e.g., "Unity Catalog metric views", "Lakebase autoscaling", "Model Serving endpoints" + +2. **Who is the target user?** + - Data engineers, data scientists, ML engineers, analysts, app developers? + +3. 
**What are 3-5 specific tasks a user would ask Claude to do with this skill?** + - These become the seed for test cases later. + +4. **Does this overlap with an existing skill?** + - Check `databricks-skills/` for related skills. If overlap exists, clarify the boundary. + +5. **What language(s) should code examples use?** + - Python, SQL, YAML (for bundles), or a mix? + +**Output**: A short summary paragraph capturing the skill's purpose, audience, and scope. Save this — it becomes the seed for the description. + +--- + +## Phase 2: Interview & Research + +Dig deeper into the domain. Ask the contributor (or research from Databricks docs): + +### Domain Questions +- What are the **critical rules** a user must always follow? (e.g., "always use serverless", "never use deprecated DLT syntax") +- What are the **common mistakes** or deprecated patterns to avoid? +- What **API versions or SDK methods** are relevant? Verify they exist in the current Databricks SDK. +- Are there **multiple approaches** to the same task? (e.g., Python SDK vs. SQL vs. CLI) — document the recommended one first. + +### Scope Questions +- What should this skill **NOT** cover? (Explicit exclusions prevent scope creep) +- Are there **prerequisite skills** the user should know about? (e.g., databricks-unity-catalog for permissions) +- Should the skill reference any **MCP tools** from `databricks-mcp-server/`? If so, verify the tool names exist as `@mcp.tool` functions. + +### Structure Questions +- Is the content **small enough for a single SKILL.md** (<500 lines)? +- If not, what content should go into **reference files**? (Deep API references, exhaustive parameter lists, migration guides) + +**Output**: A structured outline of sections and reference files. + +--- + +## Phase 3: Write SKILL.md + +Create the skill directory and SKILL.md following ai-dev-kit conventions. 
+ +### 3.1 Directory Structure + +``` +databricks-skills/{skill-name}/ +├── SKILL.md # Required — main skill file +├── {reference1}.md # Optional — deep reference content +├── {reference2}.md # Optional — additional patterns +└── ... +``` + +The directory name MUST match the `name` field in frontmatter. + +### 3.2 Frontmatter + +```yaml +--- +name: {skill-name} +description: "{One paragraph, max 1024 chars. Must include 'Use when' trigger phrases. Be specific and pushy — Claude tends to under-trigger skills, so make the description assertive about when to activate.}" +--- +``` + +**Frontmatter rules** (enforced by CI): +- `name`: Required. Lowercase letters, numbers, hyphens only. Max 64 chars. Must not contain "anthropic" or "claude". +- `description`: Required. Non-empty. Max 1024 chars. No XML tags. Must contain "Use when" with specific trigger scenarios. + +**Writing effective descriptions:** +- Lead with what the skill does: "Creates, configures, and manages X..." +- Include explicit trigger phrases: "Use when building X, working with Y, or when the user mentions Z." +- List specific keywords that should activate the skill +- Be assertive — "Use when" not "Can be used when" +- Example from a good skill: + ``` + "Patterns for Databricks Vector Search: create endpoints and indexes, query with filters, manage embeddings. Use when building RAG applications, semantic search, or similarity matching. Covers both storage-optimized and standard endpoints." + ``` + +### 3.3 Progressive Disclosure Architecture + +Structure content in three tiers: + +| Tier | What | When Loaded | Budget | +|------|------|-------------|--------| +| **Metadata** | name + description in frontmatter | Always in context | ~100 words | +| **Body** | SKILL.md content below frontmatter | When skill triggers | <500 lines ideal | +| **References** | Separate .md files | When Claude reads them | Unlimited | + +**Key principle**: Keep SKILL.md lean. 
Move deep reference material (exhaustive API docs, parameter lists, migration guides) into reference files. The body should contain enough to handle 80% of requests; reference files cover the remaining 20%. + +### 3.4 Recommended Body Sections + +Based on patterns from the best existing skills: + +```markdown +# {Skill Title} + +{One-paragraph summary + critical rules if any} + +## When to Use + +Use this skill when: +- {Specific scenario 1} +- {Specific scenario 2} +- {Specific scenario 3} + +## Overview + +{Component table or conceptual summary — help the user understand the landscape before diving into code} + +## Quick Start + +{The simplest, most common use case. Working code example the user can copy.} + +## Common Patterns + +### Pattern 1: {Descriptive Name} +{Code example + brief explanation} + +### Pattern 2: {Descriptive Name} +{Code example + brief explanation} + +## Reference Files + +- [{reference1}.md]({reference1}.md) - {What this covers} +- [{reference2}.md]({reference2}.md) - {What this covers} + +## Common Issues + +| Issue | Solution | +|-------|----------| +| **{Problem}** | {Fix} | +| **{Problem}** | {Fix} | +``` + +**Section guidelines:** +- **Critical Rules**: If the domain has rules that MUST always be followed (like "always use serverless"), put them right after the title, before any other section. Use bold and imperative language. +- **When to Use**: Reinforces the description triggers. Helps Claude decide if the skill applies. +- **Overview**: Tables work well for comparing options (endpoint types, index types, SDK methods). +- **Quick Start**: Must be a complete, working example — not pseudocode. This is the most important section. +- **Common Patterns**: 2-5 patterns covering the most frequent use cases. Each should have a real code example. +- **Reference Files**: Only include if SKILL.md would exceed ~400 lines without them. +- **Common Issues**: Known gotchas, error messages, and their fixes. 
+ +### 3.5 Reference File Conventions + +Reference files follow the same markdown format without frontmatter: + +```markdown +# {Pattern/Topic Name} + +## When to Use +{Specific scenario this reference covers} + +## Code Example +{Working code with explanations} + +## Explanation +{Why this approach, tradeoffs, alternatives} + +## Common Variations +{Modifications for different scenarios} +``` + +### 3.6 Code Example Standards + +- All code blocks MUST have a language tag (```python, ```sql, ```yaml, ```bash) +- Use realistic values, not "foo/bar" placeholders +- Include imports and setup — examples should be copy-pasteable +- Use current APIs only — no deprecated patterns (e.g., `@dp.table` not `@dlt.table`) +- Prefer the Databricks SDK (`from databricks.sdk import WorkspaceClient`) for Python examples + +--- + +## Phase 4: Write Test Cases + +Generate test scaffolding using the existing test framework. + +### 4.1 Initialize Test Scaffolding + +Run: +```bash +uv run python .test/scripts/init_skill.py {skill-name} +``` + +Or use the `/skill-test` command: +``` +/skill-test {skill-name} init +``` + +This creates the test directory at `.test/skills/{skill-name}/` with template files. + +### 4.2 Write Ground Truth + +Create `.test/skills/{skill-name}/ground_truth.yaml` with 3-5 test cases. Use the tasks from Phase 1 as seeds. + +See [Test Format Specification](references/test-format.md) for the full schema. 
+ +**Test case guidelines:** +- Include at least one "happy path" (straightforward use case) +- Include at least one "edge case" (unusual input, boundary condition) +- Include at least one that tests the skill's boundaries (what it should NOT do) +- Use realistic prompts — write them as a user would naturally phrase the request +- Expected responses should be complete and correct — they become the reference standard + +### 4.3 Write Routing Tests + +Add entries to `.test/skills/_routing/ground_truth.yaml`: + +```yaml +# Should trigger +- id: "routing_{skill-name}_001" + inputs: + prompt: "{A prompt that should trigger this skill}" + expectations: + expected_skills: ["{skill-name}"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "{Why this should trigger the skill}" + +# Should NOT trigger +- id: "routing_{skill-name}_neg_001" + inputs: + prompt: "{A prompt that sounds related but should NOT trigger}" + expectations: + expected_skills: [] + is_multi_skill: false + metadata: + category: "no_match" + difficulty: "medium" + reasoning: "{Why this should NOT trigger despite seeming related}" +``` + +Include at minimum: +- 3 should-trigger prompts (easy, medium, hard) +- 2 should-NOT-trigger prompts (plausible false positives) + +### 4.4 Configure Manifest + +Edit `.test/skills/{skill-name}/manifest.yaml`: + +```yaml +skill: + name: "{skill-name}" + source_path: "databricks-skills/{skill-name}" + description: "{Short description}" + +evaluation: + datasets: + - path: ground_truth.yaml + type: yaml + + scorers: + tier1: + - python_syntax # If skill produces Python + - sql_syntax # If skill produces SQL + - pattern_adherence + - no_hallucinated_apis + tier2: + - code_executes # If execution can be verified + tier3: + - Guidelines + + quality_gates: + tier1_pass_rate: 1.0 + tier2_pass_rate: 0.8 + tier3_pass_rate: 0.85 +``` + +Remove scorers that don't apply (e.g., drop `sql_syntax` if the skill only produces Python). 
+ +--- + +## Phase 5: Validate & Register + +### 5.1 Run CI Validation + +```bash +python .github/scripts/validate_skills.py +``` + +This checks: +- SKILL.md exists with valid frontmatter +- name/description meet constraints +- Skill is registered in install_skills.sh + +### 5.2 Register in Install Script + +Add the skill name to the appropriate variable in `databricks-skills/install_skills.sh`: + +1. Add to `DATABRICKS_SKILLS` (the main skill list) +2. Add to the appropriate profile in `install.sh` and `install.ps1` (data-engineer, analyst, ai-ml-engineer, or app-developer) +3. If the skill has extra files beyond SKILL.md, add entries to the `get_skill_extra_files()` function + +### 5.3 Update README + +Add the skill to the skills table in `databricks-skills/README.md`. + +--- + +## Phase 6: Evaluate & Improve + +Use the existing test framework to measure and improve quality. + +### 6.1 Run Evaluation + +```bash +# Quick evaluation against ground truth +uv run python .test/scripts/run_eval.py {skill-name} + +# Full MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py {skill-name} + +# Routing accuracy check +uv run python .test/scripts/routing_eval.py _routing +``` + +### 6.2 Save Baseline + +Once results are acceptable: +```bash +uv run python .test/scripts/baseline.py {skill-name} +``` + +### 6.3 Optimize (Advanced) + +For description optimization and skill improvement, use the GEPA framework: +```bash +uv run python .test/scripts/optimize.py {skill-name} --preset quick +``` + +Presets: `quick` (15 iterations), `standard` (50), `thorough` (150). + +### 6.4 Iteration Loop + +After evaluation, improve the skill by: +1. **Reviewing failures**: Read the evaluation output for failed test cases +2. **Identifying patterns**: Are failures concentrated in one area (e.g., SQL syntax, missing facts)? +3. **Updating SKILL.md**: Add missing patterns, fix incorrect examples, clarify ambiguous instructions +4. 
**Re-running evaluation**: Verify improvements and check for regressions + +Repeat until quality gates are met: +- Tier 1 (syntax/patterns): 100% pass rate +- Tier 2 (execution): 80% pass rate +- Tier 3 (LLM judge): 85% pass rate + +--- + +## Checklist + +Before submitting a PR, verify: + +- [ ] `SKILL.md` has valid frontmatter (name, description with "Use when" triggers) +- [ ] Description is assertive and includes specific trigger keywords +- [ ] Body is under 500 lines (reference files used for overflow) +- [ ] All code blocks have language tags +- [ ] Code examples use current APIs (no deprecated patterns) +- [ ] Quick Start example is complete and copy-pasteable +- [ ] Common Issues section covers known gotchas +- [ ] Test scaffolding exists at `.test/skills/{skill-name}/` +- [ ] At least 3 ground truth test cases written +- [ ] At least 5 routing test cases (3 positive, 2 negative) added +- [ ] `validate_skills.py` passes +- [ ] Skill registered in `install_skills.sh` +- [ ] Skill added to appropriate profile in `install.sh`/`install.ps1` +- [ ] Skills table updated in `databricks-skills/README.md` + +--- + +## Attribution + +This skill's authoring workflow is adapted from [Anthropic's skill-creator](https://github.com/anthropics/skills/tree/main/skills/skill-creator), licensed under Apache License 2.0. diff --git a/.skill-authoring/install_skill_authoring.sh b/.skill-authoring/install_skill_authoring.sh new file mode 100755 index 00000000..0b03e384 --- /dev/null +++ b/.skill-authoring/install_skill_authoring.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Install/sync skill-authoring skill to .claude/skills/ +# +# The source of truth is .skill-authoring/ (this directory). +# This script copies files to .claude/skills/skill-authoring/ so Claude Code +# picks them up automatically when contributors clone the repo. 
+# +# Usage: +# bash .skill-authoring/install_skill_authoring.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TARGET_DIR="$REPO_ROOT/.claude/skills/skill-authoring" + +mkdir -p "$TARGET_DIR/references" + +echo "Syncing skill-authoring to $TARGET_DIR ..." + +cp "$SCRIPT_DIR/SKILL.md" "$TARGET_DIR/SKILL.md" +cp "$SCRIPT_DIR/references/skill-format.md" "$TARGET_DIR/references/skill-format.md" +cp "$SCRIPT_DIR/references/test-format.md" "$TARGET_DIR/references/test-format.md" + +echo "Done. skill-authoring skill installed at $TARGET_DIR" diff --git a/.skill-authoring/references/skill-format.md b/.skill-authoring/references/skill-format.md new file mode 100644 index 00000000..69437b34 --- /dev/null +++ b/.skill-authoring/references/skill-format.md @@ -0,0 +1,156 @@ +# Skill Format Specification + +Complete reference for ai-dev-kit skill format, frontmatter rules, and structural conventions. + +## Directory Layout + +``` +databricks-skills/{skill-name}/ +├── SKILL.md # Required — main skill instructions +├── {reference}.md # Optional — deep reference content +└── ... 
# Additional reference files as needed +``` + +- Directory name must exactly match the `name` field in SKILL.md frontmatter +- Use lowercase letters, numbers, and hyphens only +- Prefix with `databricks-` for Databricks platform skills (convention, not enforced) + +## Frontmatter Specification + +SKILL.md must begin with YAML frontmatter delimited by `---`: + +```yaml +--- +name: "{skill-name}" +description: "{skill-description}" +--- +``` + +### `name` (required) + +| Constraint | Rule | +|------------|------| +| Format | Lowercase letters, numbers, hyphens only (`^[a-z0-9]+(-[a-z0-9]+)*$`) | +| Max length | 64 characters | +| Reserved words | Must not contain "anthropic" or "claude" | +| XML tags | Must not contain XML/HTML tags | +| Match | Must match the directory name | + +### `description` (required) + +| Constraint | Rule | +|------------|------| +| Min length | Non-empty | +| Max length | 1024 characters | +| XML tags | Must not contain XML/HTML tags | +| Trigger phrases | Should contain "Use when" with specific scenarios | + +**Writing effective descriptions:** + +The description is the primary mechanism Claude uses to decide whether to activate a skill. It is always loaded in context (~100 words). Make it count. + +1. **Lead with action**: "Creates, configures, and manages..." +2. **Include explicit triggers**: "Use when building X, working with Y, or when the user mentions Z." +3. **List keywords**: Include domain-specific terms that should activate the skill +4. **Be assertive**: "Use when" not "Can be used when" — Claude under-triggers by default + +**Good example:** +``` +"Creates, configures, and updates Databricks Lakeflow Spark Declarative Pipelines (SDP/LDP) +using serverless compute. Handles streaming tables, materialized views, CDC, SCD Type 2, and +Auto Loader ingestion patterns. 
Use when building data pipelines, working with Delta Live Tables, +ingesting streaming data, implementing change data capture, or when the user mentions SDP, LDP, +DLT, Lakeflow pipelines, streaming tables, or bronze/silver/gold medallion architectures." +``` + +**Bad example:** +``` +"A skill for Databricks pipelines." +``` + +## Progressive Disclosure Model + +Content is structured in three tiers to manage context window usage: + +| Tier | Location | When Loaded | Budget | +|------|----------|-------------|--------| +| **1. Metadata** | Frontmatter (name + description) | Always in context | ~100 words | +| **2. Body** | SKILL.md content below frontmatter | When skill triggers | <500 lines | +| **3. References** | Separate .md files in same directory | When Claude reads them on demand | Unlimited | + +### Tier 1: Metadata +- Always in Claude's context window +- Must be information-dense — every word earns its place +- The description is the skill's "elevator pitch" AND its routing signal + +### Tier 2: Body (SKILL.md) +- Loaded when the skill triggers based on description match +- Target: under 500 lines for the full body +- Should handle 80% of user requests without needing reference files +- Include working code examples, common patterns, and critical rules + +### Tier 3: References +- Loaded only when Claude specifically reads them (via `Read` tool) +- Use for: exhaustive API references, parameter lists, migration guides, advanced patterns +- No line limit, but keep individual files focused on one topic +- Reference them from SKILL.md: `See [API Reference](api-reference.md) for details.` + +## Body Section Conventions + +Based on analysis of the best existing skills (databricks-vector-search, databricks-spark-declarative-pipelines, databricks-python-sdk): + +### Recommended Sections (in order) + +1. **Title** (`# {Skill Name}`) — Brief summary paragraph +2. **Critical Rules** — Only if the domain has rules that MUST always be followed. 
Use bold, imperative language. Place before all other sections. +3. **When to Use** — Bulleted list of specific scenarios. Reinforces description triggers. +4. **Overview** — Conceptual summary. Tables work well for comparing options (types, methods, approaches). +5. **Quick Start** — The simplest, most common use case. Complete, working, copy-pasteable code. +6. **Common Patterns** — 2-5 patterns for frequent use cases. Each has a heading, code example, and brief explanation. +7. **Reference Files** — Links to reference .md files with descriptions of what each covers. +8. **Common Issues** — Table of known gotchas, error messages, and fixes. + +### Section Guidelines + +- **Critical Rules**: Not every skill needs this. Only include when there are hard requirements (e.g., "MUST use serverless", "NEVER use deprecated syntax"). Example from databricks-spark-declarative-pipelines: + ```markdown + ## Critical Rules (always follow) + - **MUST** confirm language as Python or SQL + - **MUST** create serverless pipelines by default + ``` + +- **Quick Start**: This is the most important section. It should be a single, complete example that works out of the box. Include imports, setup, and realistic values. 
+ +- **Common Patterns**: Each pattern should have: + - A descriptive heading (`### Pattern 2: Hybrid Search with Filters`) + - Working code (with language tag) + - 1-2 sentences explaining when/why to use this pattern + +- **Common Issues**: Use a table format for scannability: + ```markdown + | Issue | Solution | + |-------|----------| + | **`PERMISSION_DENIED` on endpoint** | Grant `USE ENDPOINT` via Unity Catalog | + ``` + +## Code Example Standards + +- All code blocks MUST have a language tag: ```python, ```sql, ```yaml, ```bash +- Use realistic values (catalog names, table names, paths) — not "foo", "bar", "example" +- Include imports and setup code — examples should be copy-pasteable +- Use current Databricks APIs only: + - `@dp.table` not `@dlt.table` + - `CLUSTER BY` not `PARTITION BY` + - `mlflow.genai.evaluate` not `mlflow.evaluate` + - `from databricks.sdk import WorkspaceClient` for SDK examples +- Show the most common/recommended approach first, alternatives second + +## Registration + +After creating a skill, it must be registered in three places: + +1. **`databricks-skills/install_skills.sh`**: Add to `DATABRICKS_SKILLS` variable +2. **`install.sh` / `install.ps1`**: Add to the appropriate profile (data-engineer, analyst, ai-ml-engineer, app-developer) +3. **`databricks-skills/README.md`**: Add to the skills table + +The CI validator (`validate_skills.py`) cross-references directories against `install_skills.sh` and will fail if a skill directory exists without registration. diff --git a/.skill-authoring/references/test-format.md b/.skill-authoring/references/test-format.md new file mode 100644 index 00000000..07c75f55 --- /dev/null +++ b/.skill-authoring/references/test-format.md @@ -0,0 +1,263 @@ +# Test Format Specification + +Reference for ai-dev-kit's skill testing formats: `ground_truth.yaml`, `manifest.yaml`, and routing tests. + +All test files live at the repository root under `.test/skills/`, not relative to the skill directory. 
+ +## File Locations + +``` +.test/skills/ +├── _routing/ +│ └── ground_truth.yaml # Routing tests (shared across all skills) +├── {skill-name}/ +│ ├── ground_truth.yaml # Skill-specific test cases +│ ├── candidates.yaml # Pending test cases for review +│ └── manifest.yaml # Evaluation configuration +``` + +## Manifest (`manifest.yaml`) + +Defines skill metadata, evaluation datasets, scorers, and quality gates. + +```yaml +skill: + name: "{skill-name}" # Must match directory name + source_path: "databricks-skills/{skill-name}" # Path to skill source + description: "{Short description of what's being tested}" + +evaluation: + datasets: + - path: ground_truth.yaml + type: yaml + + scorers: + tier1: # Deterministic (fast, run first) + - python_syntax # AST parsing of Python code blocks + - sql_syntax # Structural SQL validation + - pattern_adherence # Regex pattern matching + - no_hallucinated_apis # Check for deprecated/incorrect APIs + tier2: # Execution-based + - code_executes # Validates generated code runs + tier3: # LLM Judge (slowest, run last) + - Guidelines # Semantic evaluation against guidelines + + quality_gates: + tier1_pass_rate: 1.0 # 100% — syntax and patterns must always pass + tier2_pass_rate: 0.8 # 80% — some execution failures acceptable + tier3_pass_rate: 0.85 # 85% — LLM judge threshold +``` + +### Available Scorers + +**Tier 1 (Deterministic):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `python_syntax` | `outputs.response` | All Python code blocks parse via AST | +| `sql_syntax` | `outputs.response` | SQL statements are structurally valid | +| `pattern_adherence` | `outputs.response` + `expectations.expected_patterns` | Required regex patterns present | +| `no_hallucinated_apis` | `outputs.response` | No deprecated APIs (`@dlt.table`, `PARTITION BY`, etc.) 
| +| `expected_facts_present` | `outputs.response` + `expectations.expected_facts` | Required facts mentioned | + +**Tier 2 (Execution):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `code_executes` | `outputs.execution_success` | Generated code runs successfully | + +**Tier 3 (LLM Judge):** +| Scorer | Input | What It Checks | +|--------|-------|----------------| +| `Guidelines` | `expectations.guidelines` | Semantic adherence judged by LLM | + +**Choose scorers based on your skill:** +- Python-only skill: include `python_syntax`, drop `sql_syntax` +- SQL-only skill: include `sql_syntax`, drop `python_syntax` +- Both: include both +- Always include: `pattern_adherence`, `no_hallucinated_apis` +- Include `Guidelines` for nuanced quality checks + +## Ground Truth (`ground_truth.yaml`) + +Test cases with inputs, expected outputs, expectations, and metadata. + +```yaml +test_cases: + - id: "{skill-name}_{category}_{number}" # Unique identifier + + inputs: + prompt: | + {Natural language prompt as a user would write it. + Be specific and realistic — avoid generic "do X" prompts.} + + outputs: + response: | + {Complete expected response including code blocks, explanations, + and any other content the skill should produce.} + execution_success: true # Optional: did the code run? 
+ + expectations: + expected_facts: # Tier 1: must be mentioned + - "Uses STREAMING TABLE for incremental ingestion" + - "Includes CLUSTER BY instead of PARTITION BY" + + expected_patterns: # Tier 1: regex patterns + - pattern: "CREATE OR REFRESH STREAMING TABLE" + min_count: 1 + - pattern: "CLUSTER BY" + min_count: 1 + - pattern: "PARTITION BY" # Negative check + max_count: 0 + min_count: 0 + + guidelines: # Tier 3: LLM judge criteria + - "Must use modern SDP syntax, not legacy DLT" + - "Should include metadata columns for lineage" + + metadata: + category: "happy_path" # happy_path | edge_case | error_handling | boundary + difficulty: "easy" # easy | medium | hard + source: "manual" # manual | generated | trace + tags: ["bronze", "ingestion"] # Searchable tags +``` + +### Test Case Design Guidelines + +**Minimum: 3-5 test cases per skill:** + +1. **Happy path (easy)** — The most common, straightforward use case. If a user asks the simplest possible question about this skill, does it respond correctly? + +2. **Happy path (medium)** — A realistic use case with some complexity (multiple parameters, configuration choices). + +3. **Edge case** — Unusual input, boundary condition, or a request that tests the limits of the skill's knowledge. + +4. **Error/boundary case** — Something the skill should gracefully handle or refuse (e.g., a request for a deprecated pattern). + +5. **Multi-step (optional)** — A complex request requiring the skill to produce multiple code blocks or make architectural decisions. + +**Writing good prompts:** +- Write as a real user would — natural language, not formal +- Include context: "I have a Delta table at catalog.schema.table with columns x, y, z..." 
+- Be specific enough that there's a clear "correct" answer +- Vary complexity across test cases + +**Writing good expected responses:** +- Must be complete and correct — these are the gold standard +- Include all code blocks with language tags +- Use current APIs (not deprecated ones) +- Include brief explanations where a user would expect them + +### Pattern Specification + +Patterns support both simple strings and structured objects: + +```yaml +expected_patterns: + # Simple: just a regex string (min_count defaults to 1) + - "WorkspaceClient" + + # Structured: full control + - pattern: "CREATE OR REFRESH STREAMING TABLE" + min_count: 1 # Minimum matches required (default: 1) + max_count: 3 # Maximum matches allowed (optional) + description: "Uses streaming table syntax" + + # Negative check: pattern must NOT appear + - pattern: "@dlt\\.table" + min_count: 0 + max_count: 0 + description: "Must not use deprecated DLT syntax" +``` + +## Routing Tests (`_routing/ground_truth.yaml`) + +Shared across all skills. Tests whether prompts trigger the correct skill(s). 
+ +```yaml +test_cases: + # Should trigger (single skill) + - id: "routing_{skill-name}_001" + inputs: + prompt: "{Prompt that should clearly trigger this skill}" + expectations: + expected_skills: ["{skill-name}"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "{Why this should trigger — which keyword/phrase matches}" + + # Should trigger (multi-skill) + - id: "routing_{skill-name}_multi_001" + inputs: + prompt: "{Prompt that should trigger this skill AND another}" + expectations: + expected_skills: ["{skill-name}", "{other-skill}"] + is_multi_skill: true + metadata: + category: "multi_skill" + difficulty: "medium" + reasoning: "{Why both skills should activate}" + + # Should NOT trigger (false positive guard) + - id: "routing_{skill-name}_neg_001" + inputs: + prompt: "{Prompt that sounds related but should NOT trigger}" + expectations: + expected_skills: [] + is_multi_skill: false + metadata: + category: "no_match" + difficulty: "medium" + reasoning: "{Why this should NOT trigger despite seeming related}" +``` + +### Routing Test Guidelines + +**Minimum per skill: 5 routing test cases:** +- 3 should-trigger (easy, medium, hard difficulty) +- 2 should-NOT-trigger (plausible false positives) + +**Difficulty levels:** +- **Easy**: Contains explicit keywords from the skill description (e.g., "streaming table" for SDP) +- **Medium**: Uses domain language without exact keywords (e.g., "incremental ingestion pipeline") +- **Hard**: Ambiguous prompt that could match multiple skills but should match this one + +**False positive guards:** +- Prompts that use related terminology but belong to a different skill +- Generic prompts that shouldn't trigger any skill (e.g., "What's the weather?") + +## Initializing Test Scaffolding + +Use the test framework to create template files: + +```bash +# Via script +uv run python .test/scripts/init_skill.py {skill-name} + +# Via slash command +/skill-test {skill-name} init +``` + +This 
creates: +- `.test/skills/{skill-name}/manifest.yaml` — with default scorers +- `.test/skills/{skill-name}/ground_truth.yaml` — empty template +- `.test/skills/{skill-name}/candidates.yaml` — empty template + +## Running Tests + +```bash +# Quick evaluation +uv run python .test/scripts/run_eval.py {skill-name} + +# MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py {skill-name} + +# Routing accuracy +uv run python .test/scripts/routing_eval.py _routing + +# Compare against baseline +uv run python .test/scripts/regression.py {skill-name} + +# Save current results as baseline +uv run python .test/scripts/baseline.py {skill-name} +``` diff --git a/.test/SKILL.md b/.test/SKILL.md index 9ba42e10..a9790856 100644 --- a/.test/SKILL.md +++ b/.test/SKILL.md @@ -44,6 +44,7 @@ The `/skill-test` command provides an interactive CLI for testing Databricks ski | `list-traces` | List available traces (MLflow or local) | | `scorers` | List configured scorers for a skill | | `scorers update` | Add/remove scorers or update default guidelines | +| `quick-trigger` | Quick smoke test: does the skill trigger on expected prompts? | | `sync` | Sync YAML to Unity Catalog (Phase 2) | ### Quick Examples @@ -92,6 +93,7 @@ uv run python .test/scripts/{subcommand}.py {skill_name} [options] | `sync` | `sync.py` | | `trace-eval` | `trace_eval.py` | | `list-traces` | `list_traces.py` | +| `quick-trigger` | `quick_trigger.py` | | `_routing mlflow` | `routing_eval.py` | Use `--help` on any script for available options. diff --git a/.test/scripts/quick_trigger.py b/.test/scripts/quick_trigger.py new file mode 100644 index 00000000..7dfbf950 --- /dev/null +++ b/.test/scripts/quick_trigger.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""Quick trigger validation for skills. + +Tests whether a skill triggers correctly by running prompts through +`claude -p` and checking if the skill name appears in the output. 
+This is a lightweight smoke test for development — use routing_eval.py
+for comprehensive routing evaluation.
+
+Inspired by Anthropic's skill-creator run_eval.py (Apache 2.0).
+
+Usage:
+    uv run python .test/scripts/quick_trigger.py <skill-name> [options]
+
+Options:
+    --prompts    Comma-separated list of test prompts (overrides auto-discovery)
+    --negative   Comma-separated list of prompts that should NOT trigger the skill
+    --verbose    Show full claude output
+    --timeout    Timeout per prompt in seconds (default: 30)
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+from _common import find_repo_root, setup_path
+
+repo_root = setup_path()
+
+SKILLS_DIR = repo_root / "databricks-skills"
+ROUTING_TESTS = repo_root / ".test" / "skills" / "_routing" / "ground_truth.yaml"
+
+
+def load_skill_description(skill_name: str) -> str | None:
+    """Load description from a skill's SKILL.md frontmatter."""
+    skill_md = SKILLS_DIR / skill_name / "SKILL.md"
+    if not skill_md.exists():
+        return None
+    content = skill_md.read_text()
+    match = re.match(r"^---\n(.+?)\n---", content, re.DOTALL)
+    if match:
+        import yaml
+
+        fm = yaml.safe_load(match.group(1))
+        return fm.get("description", "")
+    return None
+
+
+def load_routing_tests(skill_name: str) -> tuple[list[str], list[str]]:
+    """Load routing test prompts for this skill from ground_truth.yaml.
+
+    Returns (positive_prompts, negative_prompts).
+ """ + positive = [] + negative = [] + + if not ROUTING_TESTS.exists(): + return positive, negative + + import yaml + + data = yaml.safe_load(ROUTING_TESTS.read_text()) + for tc in data.get("test_cases", []): + expected = tc.get("expectations", {}).get("expected_skills", []) + prompt = tc.get("inputs", {}).get("prompt", "") + if not prompt: + continue + if skill_name in expected: + positive.append(prompt) + elif not expected and skill_name in tc.get("id", ""): + negative.append(prompt) + + return positive, negative + + +def run_prompt(prompt: str, timeout: int = 30) -> tuple[str, float]: + """Run a prompt through claude -p and return (output, elapsed_seconds).""" + start = time.time() + try: + result = subprocess.run( + ["claude", "-p", prompt, "--output-format", "text"], + capture_output=True, + text=True, + timeout=timeout, + ) + elapsed = time.time() - start + return result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - start + return f"TIMEOUT after {timeout}s", elapsed + except FileNotFoundError: + return "ERROR: claude CLI not found. 
Install Claude Code first.", 0.0 + + +def check_skill_triggered(output: str, skill_name: str) -> bool: + """Check if the skill name appears in the output, suggesting it triggered.""" + # Look for skill name in output (case-insensitive, hyphen/underscore flexible) + pattern = skill_name.replace("-", "[-_\\s]?") + return bool(re.search(pattern, output, re.IGNORECASE)) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Quick trigger validation for skills") + parser.add_argument("skill_name", help="Name of the skill to test") + parser.add_argument("--prompts", help="Comma-separated positive test prompts") + parser.add_argument("--negative", help="Comma-separated negative test prompts") + parser.add_argument("--verbose", action="store_true", help="Show full output") + parser.add_argument( + "--timeout", type=int, default=30, help="Timeout per prompt (seconds)" + ) + args = parser.parse_args() + + skill_name = args.skill_name + + # Validate skill exists + desc = load_skill_description(skill_name) + if desc is None: + print(f"ERROR: Skill '{skill_name}' not found in {SKILLS_DIR}") + return 1 + + print(f"Quick Trigger Test: {skill_name}") + print(f"Description: {desc[:100]}...") + print() + + # Gather prompts + if args.prompts: + positive_prompts = [p.strip() for p in args.prompts.split(",")] + else: + positive_prompts, _ = load_routing_tests(skill_name) + + if args.negative: + negative_prompts = [p.strip() for p in args.negative.split(",")] + else: + _, negative_prompts = load_routing_tests(skill_name) + + if not positive_prompts: + print( + "WARNING: No test prompts found. 
Add routing tests to " + ".test/skills/_routing/ground_truth.yaml or use --prompts" + ) + print( + "Example: uv run python .test/scripts/quick_trigger.py " + f'{skill_name} --prompts "Create a vector search index"' + ) + return 1 + + # Run positive tests (should trigger) + results = {"positive": [], "negative": [], "summary": {}} + pass_count = 0 + total = len(positive_prompts) + + print(f"=== Should Trigger ({total} prompts) ===\n") + for i, prompt in enumerate(positive_prompts, 1): + output, elapsed = run_prompt(prompt, args.timeout) + triggered = check_skill_triggered(output, skill_name) + status = "PASS" if triggered else "FAIL" + if triggered: + pass_count += 1 + + print(f" [{status}] ({elapsed:.1f}s) {prompt[:80]}") + if args.verbose: + print(f" Output: {output[:200]}") + + results["positive"].append( + { + "prompt": prompt, + "triggered": triggered, + "elapsed_seconds": round(elapsed, 2), + } + ) + + # Run negative tests (should NOT trigger) + neg_pass_count = 0 + neg_total = len(negative_prompts) + + if negative_prompts: + print(f"\n=== Should NOT Trigger ({neg_total} prompts) ===\n") + for prompt in negative_prompts: + output, elapsed = run_prompt(prompt, args.timeout) + triggered = check_skill_triggered(output, skill_name) + status = "PASS" if not triggered else "FAIL" + if not triggered: + neg_pass_count += 1 + + print(f" [{status}] ({elapsed:.1f}s) {prompt[:80]}") + if args.verbose: + print(f" Output: {output[:200]}") + + results["negative"].append( + { + "prompt": prompt, + "triggered": triggered, + "elapsed_seconds": round(elapsed, 2), + } + ) + + # Summary + print(f"\n=== Summary ===") + print(f" Positive (should trigger): {pass_count}/{total}") + if negative_prompts: + print(f" Negative (should NOT trigger): {neg_pass_count}/{neg_total}") + + all_passed = pass_count == total and neg_pass_count == neg_total + results["summary"] = { + "skill_name": skill_name, + "positive_pass_rate": pass_count / total if total > 0 else 0, + "negative_pass_rate": 
neg_pass_count / neg_total if neg_total > 0 else 0, + "all_passed": all_passed, + } + + # Write results to JSON + output_path = ( + repo_root / ".test" / "skills" / skill_name / "quick_trigger_results.json" + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(results, indent=2)) + print(f"\n Results saved to: {output_path}") + + return 0 if all_passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d5b120fd..a38005c1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,15 +90,95 @@ Ensure your changes work with a live Databricks workspace. ## Adding New Skills -When adding a new skill to `databricks-skills/`: - -1. Create a directory with a descriptive name -2. Include a `SKILL.md` file that defines: - - Trigger conditions (when the skill activates) - - Core patterns and best practices - - Code examples -3. Add supporting documentation files as needed -4. Update the skills table in the main README.md +### Recommended: Use the Authoring Skill + +The fastest way to create a high-quality skill is with the `skill-authoring` skill (available in `.skill-authoring/` when you clone the repo). Ask Claude: + +> "Help me create a new skill for [Databricks feature]" + +This walks you through a structured workflow: interview, draft, test, validate, register. + +### Manual Workflow + +If you prefer to work manually: + +1. **Copy the template:** + ```bash + cp -r databricks-skills/TEMPLATE databricks-skills/your-skill-name + ``` + +2. **Write SKILL.md** with valid frontmatter and content: + ```yaml + --- + name: your-skill-name + description: "What it does. Use when [scenario1], [scenario2], or when the user mentions [keywords]." + --- + ``` + +3. **Generate test scaffolding:** + ```bash + /skill-test your-skill-name init + ``` + +4. **Write test cases** in `.test/skills/your-skill-name/ground_truth.yaml` (minimum 3 cases) + +5. 
**Add routing tests** to `.test/skills/_routing/ground_truth.yaml` (minimum 3 positive, 2 negative)
+
+6. **Register the skill:**
+   - Add to `DATABRICKS_SKILLS` in `databricks-skills/install_skills.sh`
+   - Add to appropriate profile in `install.sh` and `install.ps1`
+   - Add to skills table in `databricks-skills/README.md`
+
+7. **Validate:**
+   ```bash
+   python .github/scripts/validate_skills.py
+   ```
+
+### Quality Checklist
+
+Before submitting a PR for a new skill, verify:
+
+- [ ] **Frontmatter**: `name` (kebab-case, <=64 chars) and `description` (<=1024 chars, includes "Use when" triggers)
+- [ ] **Description is assertive**: Uses "Use when" with specific triggers and domain keywords
+- [ ] **Body under 500 lines**: Reference files used for overflow content
+- [ ] **Code blocks have language tags**: `` ```python ``, `` ```sql ``, `` ```yaml ``, `` ```bash ``
+- [ ] **Code examples are current**: No deprecated APIs (`@dlt.table`, `PARTITION BY`, `mlflow.evaluate`)
+- [ ] **Quick Start is complete**: Copy-pasteable with imports and realistic values
+- [ ] **Common Issues documented**: Known gotchas with solutions
+- [ ] **Test scaffolding exists**: `.test/skills/your-skill-name/` with ground_truth.yaml and manifest.yaml
+- [ ] **Routing tests added**: At least 5 entries (3 positive, 2 negative) in `_routing/ground_truth.yaml`
+- [ ] **CI validation passes**: `python .github/scripts/validate_skills.py`
+- [ ] **Registered in install scripts**: `install_skills.sh`, `install.sh`, `install.ps1`
+- [ ] **README updated**: Skill added to `databricks-skills/README.md`
+
+### Skill Format Reference
+
+For detailed format specifications, see:
+- `.skill-authoring/references/skill-format.md` — Frontmatter rules, progressive disclosure, section conventions
+- `.skill-authoring/references/test-format.md` — ground_truth.yaml and manifest.yaml schemas
+
+### Evaluation & Optimization
+
+After creating a skill, measure and improve quality:
+
+```bash
+# Quick trigger smoke test
+uv run python 
.test/scripts/quick_trigger.py your-skill-name + +# Evaluation against ground truth +uv run python .test/scripts/run_eval.py your-skill-name + +# Full MLflow evaluation with LLM judges +uv run python .test/scripts/mlflow_eval.py your-skill-name + +# Description optimization (GEPA framework) +uv run python .test/scripts/optimize.py your-skill-name --preset quick +``` + +Quality gates to meet: +- **Tier 1** (syntax/patterns): 100% pass rate +- **Tier 2** (execution): 80% pass rate +- **Tier 3** (LLM judge): 85% pass rate ## Updating Existing Skills diff --git a/databricks-skills/README.md b/databricks-skills/README.md index ae4886fc..e889e92a 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -110,6 +110,37 @@ description: "What this teaches" ... ``` +## Developing Skills + +For contributors creating new skills, ai-dev-kit provides a guided authoring workflow and evaluation framework. + +### Quick Start (Contributors) + +1. **Use the authoring skill** — Clone the repo and ask Claude: "Help me create a new skill for [feature]". The `skill-authoring` skill (in `.skill-authoring/`) will guide you through the full workflow: interview, draft, test, validate, register. + +2. **Or start manually** — Copy the template and fill in the sections: + ```bash + cp -r databricks-skills/TEMPLATE databricks-skills/your-skill-name + # Edit SKILL.md with your content + ``` + +3. **Generate test scaffolding**: + ```bash + /skill-test your-skill-name init + ``` + +4. **Run quick trigger validation**: + ```bash + uv run python .test/scripts/quick_trigger.py your-skill-name + ``` + +5. **Run full evaluation**: + ```bash + uv run python .test/scripts/run_eval.py your-skill-name + ``` + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for the full contributor guide and quality checklist. 
+ ## Troubleshooting **Skills not loading?** Check `.claude/skills/` exists and each skill has `SKILL.md` diff --git a/databricks-skills/TEMPLATE/SKILL.md b/databricks-skills/TEMPLATE/SKILL.md index c3764c43..8e3cbb39 100644 --- a/databricks-skills/TEMPLATE/SKILL.md +++ b/databricks-skills/TEMPLATE/SKILL.md @@ -1,24 +1,43 @@ --- -name: template -description: "A brief one-sentence description of what this skill helps with." +name: your-skill-name +description: "Brief description of what this skill does. Use when [specific scenario 1], [specific scenario 2], or when the user mentions [keyword1], [keyword2], [keyword3]." --- -# Skill Name +# Skill Title + +One-paragraph summary of what this skill covers and why it exists. + +## When to Use + +Use this skill when: +- Building or configuring [specific feature] +- Working with [specific API or tool] +- The user mentions [domain keywords] ## Overview -A short paragraph explaining what this skill does and when to use it. +Brief conceptual summary. Tables work well for comparing options: + +| Component | Description | When to Use | +|-----------|-------------|-------------| +| **Option A** | What it does | Best for X | +| **Option B** | What it does | Best for Y | ## Quick Start -Simple example showing the most common use case: +The simplest, most common use case. 
Must be complete and copy-pasteable:
 
 ```python
-# Example code or command
-example_function(
-    parameter1="value1",
-    parameter2="value2"
+from databricks.sdk import WorkspaceClient
+
+w = WorkspaceClient()
+
+# Example: the most common operation for this skill
+result = w.some_api.create(
+    name="my-resource",
+    config={"key": "value"}
 )
+print(f"Created: {result.name}")
 ```
 
 ## Common Patterns
@@ -26,29 +45,35 @@ example_function(
 ### Pattern 1: Basic Usage
 
 ```python
-# Simple example
-basic_example()
+# Description of what this pattern does
+w.some_api.basic_operation(
+    param1="value1",
+    param2="value2"
+)
 ```
 
-### Pattern 2: Advanced Usage
+### Pattern 2: Advanced Configuration
 
 ```python
-# More complex example
-advanced_example(
-    option1=True,
-    option2="custom"
+# When you need more control over behavior
+w.some_api.advanced_operation(
+    param1="value1",
+    advanced_config={
+        "setting1": True,
+        "setting2": "custom"
+    }
 )
 ```
 
 ## Reference Files
 
-Link to supporting documentation files if needed:
-- [example_file1.md](example_file1.md) - Description of what this covers
-- [example_file2.md](example_file2.md) - Description of what this covers
+Link to supporting documentation if SKILL.md would exceed ~400 lines:
+- [example_file1.md](example_file1.md) - Exhaustive API parameters and options
+- [example_file2.md](example_file2.md) - Migrating from deprecated patterns
 
 ## Common Issues
 
 | Issue | Solution |
 |-------|----------|
-| **Problem description** | How to fix it |
-| **Another problem** | Another solution |
+| **`PERMISSION_DENIED` error** | Grant required permissions via Unity Catalog |
+| **Resource not found** | Verify the resource exists in the correct catalog/schema |
diff --git a/databricks-skills/TEMPLATE/example_file1.md b/databricks-skills/TEMPLATE/example_file1.md
index 2efc64dc..b14debbd 100644
--- a/databricks-skills/TEMPLATE/example_file1.md
+++ b/databricks-skills/TEMPLATE/example_file1.md
@@ -1,41 +1,65 @@
-# Example 
Pattern 1 +# Detailed API Reference + +Reference file for deep content that doesn't fit in SKILL.md (<500 lines target). ## When to Use -Use this pattern when you need to accomplish X. +Use this reference when you need exhaustive API details, full parameter lists, or advanced configuration options that go beyond the common patterns in SKILL.md. + +## API Methods -## Code Example +### method_name() + +Creates or configures a resource. ```python -def simple_example(): - """ - A simple example showing the basic pattern. - """ - result = do_something( - input="value", - options={"setting": True} - ) - return result +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +result = w.some_api.method_name( + name="my-resource", # Required: resource name + description="What it does", # Optional: human-readable description + config={ + "setting1": True, # Default: False + "setting2": "value", # Options: "value", "other" + "advanced_option": 42 # Only needed for specific use cases + } +) ``` -## Explanation +**Parameters:** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `name` | str | Yes | — | Resource name (lowercase, hyphens allowed) | +| `description` | str | No | `""` | Human-readable description | +| `config` | dict | No | `{}` | Configuration options | -- **Step 1**: First do this -- **Step 2**: Then do this -- **Step 3**: Finally do this +**Returns:** `ResourceInfo` object with `.name`, `.id`, `.status` attributes. 
## Common Variations -### Variation A +### With Custom Authentication ```python -# Alternative approach -alternative_method() +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient(profile="my-profile") +result = w.some_api.method_name(name="my-resource") ``` -### Variation B +### With Error Handling ```python -# Another way to do it -another_method() +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound, PermissionDenied + +w = WorkspaceClient() +try: + result = w.some_api.method_name(name="my-resource") +except NotFound: + print("Resource not found — check catalog and schema") +except PermissionDenied: + print("Insufficient permissions — grant via Unity Catalog") ``` diff --git a/databricks-skills/TEMPLATE/example_file2.md b/databricks-skills/TEMPLATE/example_file2.md index 18172ecf..6e91ebc8 100644 --- a/databricks-skills/TEMPLATE/example_file2.md +++ b/databricks-skills/TEMPLATE/example_file2.md @@ -1,40 +1,54 @@ -# Example Pattern 2 +# Migration Guide + +Reference file for migration patterns from deprecated APIs or older approaches. ## When to Use -Use this pattern when you need to accomplish Y. +Use this reference when migrating from legacy patterns or when Claude detects deprecated API usage in existing code. + +## Deprecated Patterns + +### Old Pattern → New Pattern -## Code Example +**Before (deprecated):** +```python +# Do NOT use — this API was deprecated in version X.Y +old_api.legacy_method(param="value") +``` +**After (current):** ```python -def another_example(): - """ - Another example showing a different pattern. 
- """ - # Configure settings - config = { - "option1": "value1", - "option2": "value2" - } - - # Execute - result = execute_task(config) - - # Handle result - if result.success: - print("Success!") - else: - print(f"Error: {result.error}") +# Use this instead +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +w.new_api.current_method(param="value") ``` -## Best Practices +**Why it changed:** Brief explanation of why the API was updated and what benefits the new approach provides. + +### Another Migration -- Keep it simple -- Use clear variable names -- Handle errors appropriately +**Before:** +```sql +-- Deprecated: PARTITION BY is no longer recommended +CREATE TABLE my_table +PARTITION BY (date_col) +AS SELECT * FROM source; +``` + +**After:** +```sql +-- Use CLUSTER BY for better performance with liquid clustering +CREATE TABLE my_table +CLUSTER BY (date_col) +AS SELECT * FROM source; +``` -## Tips +## Migration Checklist -- Tip 1: A helpful hint -- Tip 2: Another useful suggestion -- Tip 3: One more thing to remember +| Old Pattern | New Pattern | Notes | +|-------------|-------------|-------| +| `old_function()` | `new_function()` | Direct replacement | +| `@dlt.table` | `@dp.table` | Requires import change | +| `PARTITION BY` | `CLUSTER BY` | Liquid clustering, better performance | diff --git a/databricks-skills/databricks-ai-functions/SKILL.md b/databricks-skills/databricks-ai-functions/SKILL.md index e3fc3fbb..b57f0767 100644 --- a/databricks-skills/databricks-ai-functions/SKILL.md +++ b/databricks-skills/databricks-ai-functions/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-ai-functions -description: "Use Databricks built-in AI Functions (ai_classify, ai_extract, ai_summarize, ai_mask, ai_translate, ai_fix_grammar, ai_gen, ai_analyze_sentiment, ai_similarity, ai_parse_document, ai_query, ai_forecast) to add AI capabilities directly to SQL and PySpark pipelines without managing model endpoints. 
Also covers document parsing and building custom RAG pipelines (parse → chunk → index → query)." +description: "Use Databricks built-in AI Functions (ai_classify, ai_extract, ai_summarize, ai_mask, ai_translate, ai_fix_grammar, ai_gen, ai_analyze_sentiment, ai_similarity, ai_parse_document, ai_query, ai_forecast) to add AI capabilities directly to SQL and PySpark pipelines without managing model endpoints. Also covers document parsing and building custom RAG pipelines (parse → chunk → index → query). Use when classifying text, extracting entities, summarizing content, redacting PII, translating languages, scoring sentiment, comparing text similarity, parsing documents, forecasting time series, or adding any AI capability to a SQL or PySpark pipeline." --- # Databricks AI Functions diff --git a/databricks-skills/databricks-dbsql/SKILL.md b/databricks-skills/databricks-dbsql/SKILL.md index 24bf2694..6e0e2f1c 100644 --- a/databricks-skills/databricks-dbsql/SKILL.md +++ b/databricks-skills/databricks-dbsql/SKILL.md @@ -2,6 +2,7 @@ name: databricks-dbsql description: >- Databricks SQL (DBSQL) advanced features and SQL warehouse capabilities. + Use when writing SQL queries on Databricks, configuring SQL warehouses, or working with advanced SQL features. This skill MUST be invoked when the user mentions: "DBSQL", "Databricks SQL", "SQL warehouse", "SQL scripting", "stored procedure", "CALL procedure", "materialized view", "CREATE MATERIALIZED VIEW", "pipe syntax", "|>",