13 changes: 13 additions & 0 deletions .changeset/cerebras-conservative-max-tokens.md
@@ -0,0 +1,13 @@
---
"kilo-code": patch
---

fix(cerebras): use conservative max_tokens and add integration header

**Conservative max_tokens:**
The Cerebras rate limiter estimates token consumption from the request's `max_completion_tokens` upfront rather than from actual usage. When agentic tools automatically set this to the model maximum (e.g., 64K), users exhaust their quota prematurely and get rate-limited despite minimal actual token consumption.

This fix uses a conservative default of 8K tokens instead of the model maximum. This is sufficient for most agentic tool use while preserving rate limit headroom.

**Integration header:**
Added `X-Cerebras-3rd-Party-Integration: kilocode` header to all Cerebras API requests for tracking and analytics.
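The clamping behavior described by this changeset can be sketched as follows. The constant name and value are taken from the patch; `clampMaxTokens` is a hypothetical helper introduced only to illustrate the expression used in the diff:

```typescript
// Conservative cap matching the patch's CEREBRAS_DEFAULT_MAX_TOKENS.
const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192;

// Hypothetical helper mirroring the effective max_tokens computation:
// fall back to the default when unset (or zero), and never exceed the default.
function clampMaxTokens(maxTokens?: number): number {
  return Math.min(maxTokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS);
}

console.log(clampMaxTokens(65_536)); // a model-maximum request is capped to 8192
console.log(clampMaxTokens(4_096)); // smaller explicit limits pass through unchanged
console.log(clampMaxTokens(undefined)); // unset falls back to the 8192 default
```

Note that because `0` is falsy in the `maxTokens || CEREBRAS_DEFAULT_MAX_TOKENS` expression, an explicit zero also falls back to the default rather than disabling the parameter.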
16 changes: 15 additions & 1 deletion src/api/providers/cerebras.ts
@@ -16,6 +16,16 @@ import { t } from "../../i18n"
const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1"
const CEREBRAS_DEFAULT_TEMPERATURE = 0

/**
* Conservative max_tokens for Cerebras to avoid premature rate limiting.
* Cerebras rate limiter estimates token consumption using max_completion_tokens upfront,
* so requesting the model maximum (e.g., 64K) reserves that quota even if actual usage is low.
* 8K is sufficient for most agentic tool use while preserving rate limit headroom.
*/
const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192
const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration"
const CEREBRAS_INTEGRATION_NAME = "kilocode"

/**
* Removes thinking tokens from text to prevent model confusion when processing conversation history.
* This is crucial because models can get confused by their own thinking tokens in input.
@@ -131,12 +141,14 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
const cerebrasMessages = convertToCerebrasMessages(openaiMessages)

// Prepare request body following Cerebras API specification exactly
// Use conservative default to avoid premature rate limiting (Cerebras reserves quota upfront)
const effectiveMaxTokens = Math.min(max_tokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS)
const requestBody = {
model,
messages: [{ role: "system", content: systemPrompt }, ...cerebrasMessages],
stream: true,
// Use max_completion_tokens (Cerebras-specific parameter)
...(max_tokens && max_tokens > 0 && max_tokens <= 32768 ? { max_completion_tokens: max_tokens } : {}),
...(effectiveMaxTokens > 0 ? { max_completion_tokens: effectiveMaxTokens } : {}),
// Clamp temperature to Cerebras range (0 to 1.5)
...(temperature !== undefined && temperature !== CEREBRAS_DEFAULT_TEMPERATURE
? {
@@ -152,6 +164,7 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
...DEFAULT_HEADERS,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
[CEREBRAS_INTEGRATION_HEADER]: CEREBRAS_INTEGRATION_NAME,
},
body: JSON.stringify(requestBody),
})
@@ -294,6 +307,7 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
...DEFAULT_HEADERS,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
[CEREBRAS_INTEGRATION_HEADER]: CEREBRAS_INTEGRATION_NAME,
},
body: JSON.stringify(requestBody),
})
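The two hunks above attach the same integration header to both request sites. A minimal sketch of that header construction, using the constant names and values from the patch (`buildCerebrasHeaders` is a hypothetical helper for illustration; the real code inlines these fields at each `fetch` call):

```typescript
const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration";
const CEREBRAS_INTEGRATION_NAME = "kilocode";

// Hypothetical helper: builds the headers sent with every Cerebras API request,
// including the integration header added for tracking and analytics. The
// computed-key syntax [CEREBRAS_INTEGRATION_HEADER] matches the diff.
function buildCerebrasHeaders(apiKey: string): Record<string, string> {
  return {
    "Content-Type": "application/json",
    Authorization: `Bearer ${apiKey}`,
    [CEREBRAS_INTEGRATION_HEADER]: CEREBRAS_INTEGRATION_NAME,
  };
}

console.log(buildCerebrasHeaders("sk-example")["X-Cerebras-3rd-Party-Integration"]); // "kilocode"
```

Centralizing the header in a constant (rather than repeating the string literal at each call site) keeps the two request paths from drifting apart if the header name ever changes.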