diff --git a/README.md b/README.md index 0eba8b6..0325e33 100644 --- a/README.md +++ b/README.md @@ -26,21 +26,46 @@ const promptCage = new PromptCage(); // Detect prompt injection const result = await promptCage.detectInjection('Your user input here'); -console.log(result); -//=> { safe: true, detectionId: 'det_123456', error: undefined } +if (!result.safe) { + console.log('Prompt injection detected!'); + return 'I cannot process this request due to security concerns.'; +} + +// Canary token testing +const [systemPromptWithCanary, canaryWord] = promptCage.addCanaryWord( + 'You are a helpful assistant. Answer user questions accurately and concisely.' +); + +// Send to your AI model with canary in system prompt +const aiResponse = await yourAiModel.complete({ + systemPrompt: systemPromptWithCanary, + userPrompt: 'What is the capital of France?' +}); + +// Check for canary leakage +const leakageResult = promptCage.isCanaryWordLeaked(aiResponse, canaryWord); +if (leakageResult.leaked) { + console.log('Canary token leaked - possible prompt injection!'); + return 'I cannot process this request due to security concerns.'; +} + +// If we get here, both checks passed +return aiResponse; ``` ## 🔧 API ### Constructor -The constructor accepts an optional configuration object or API key string. +The constructor accepts an optional configuration object. 
| Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `options` | `string \| PromptCageOptions` | No | - | API key string or configuration object | +| `options` | `PromptCageOptions` | No | - | Configuration object | | `options.apiKey` | `string` | No | `process.env.PROMPTCAGE_API_KEY` | Your PromptCage API key | | `options.maxWaitTime` | `number` | No | `1000` | Maximum wait time in milliseconds before treating request as safe | +| `options.defaultCanaryLength` | `number` | No | `8` | Default canary word length in characters | +| `options.defaultCanaryFormat` | `string` | No | `'<!-- {canary_word} -->'` | Default format for embedding canary words (must contain `{canary_word}` placeholder) | ### detectInjection() @@ -60,6 +85,35 @@ Detects potential prompt injection in the given text. | `detectionId` | `string` | Unique identifier for this detection | | `error` | `string \| undefined` | Error message if something went wrong (optional) | +### addCanaryWord() + +Embeds a canary word into a prompt for injection testing. + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `prompt` | `string` | Yes | The original prompt text | +| `canaryWord` | `string` | No | Specific canary word to use (auto-generated if not provided) | +| `canaryFormat` | `string` | No | Format string that must contain the `{canary_word}` placeholder (defaults to `options.defaultCanaryFormat`) | + +**Returns:** `[string, string]` - Tuple of [prompt with canary, canary word used] + +### isCanaryWordLeaked() + +Checks if a canary word has been leaked in an AI model's response. 
+ +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `completion` | `string` | Yes | The AI model's response/completion to check | +| `canaryWord` | `string` | Yes | The canary word to look for | + +**Returns:** `CanaryLeakageResult` + +| Property | Type | Description | +|----------|------|-------------| +| `leaked` | `boolean` | Whether the canary word was leaked | +| `canaryWord` | `string` | The canary word that was checked | +| `error` | `string \| undefined` | Error message if the check failed (optional) | + ## 🛡️ Fail-Safe Behavior The package is designed to be **fail-safe** and will never block your application. The SDK **fails open** in all error scenarios (Network errors, Rate limit exceeded, Quota exceeded ...). @@ -97,6 +151,8 @@ if (result.safe) { } ``` + + ## ⚡ Performance Considerations The `maxWaitTime` option helps prevent performance impact on your application: @@ -104,13 +160,13 @@ The `maxWaitTime` option helps prevent performance impact on your application: ```ts // Fast response for performance-critical apps const promptCage = new PromptCage({ - apiKey: 'your-key', + apiKey: 'your-api-key', maxWaitTime: 100 // 100ms max wait }); // Longer wait for slower networks const promptCage = new PromptCage({ - apiKey: 'your-key', + apiKey: 'your-api-key', maxWaitTime: 10000 // 10 seconds max wait }); ``` diff --git a/src/index.ts b/src/index.ts index f16e178..d86dcd0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ import fetch from 'node-fetch'; +import crypto from 'crypto'; /** * Response from the PromptCage API detection endpoint @@ -32,6 +33,22 @@ export interface PromptCageOptions { apiKey?: string; /** Maximum wait time in milliseconds before treating request as safe (default: 1000ms) */ maxWaitTime?: number; + /** Default canary word length in characters (default: 8) */ + defaultCanaryLength?: number; + /** Default format for embedding canary words (default: "") */ + defaultCanaryFormat?: 
string; +} + +/** + * Result of a canary word leakage check + */ +export interface CanaryLeakageResult { + /** Whether the canary word was leaked in the completion */ + leaked: boolean; + /** The canary word that was checked */ + canaryWord: string; + /** Optional error message if the check failed */ + error?: string; } /** @@ -46,9 +63,6 @@ export interface PromptCageOptions { * // Basic usage with environment variable * const promptCage = new PromptCage(); * - * // With API key directly - * const promptCage = new PromptCage('your-api-key'); - * * // With configuration options * const promptCage = new PromptCage({ * apiKey: 'your-api-key', @@ -63,11 +77,15 @@ export class PromptCage { private baseUrl = 'https://promptcage.com/api/v1'; /** Maximum wait time in milliseconds before aborting requests */ private maxWaitTime: number; + /** Default canary word length */ + private defaultCanaryLength: number; + /** Default format for embedding canary words */ + private defaultCanaryFormat: string; /** * Creates a new PromptCage client instance * - * @param options - Configuration options or API key string + * @param options - Configuration options * @throws {Error} When no API key is provided (neither in options nor environment variable) * * @example @@ -75,24 +93,21 @@ export class PromptCage { * // Using environment variable * const promptCage = new PromptCage(); * - * // Using API key string - * const promptCage = new PromptCage('your-api-key'); - * * // Using options object * const promptCage = new PromptCage({ * apiKey: 'your-api-key', - * maxWaitTime: 2000 + * maxWaitTime: 2000, + * defaultCanaryLength: 12, + * defaultCanaryFormat: '' * }); * ``` */ - constructor(options?: PromptCageOptions | string) { - if (typeof options === 'string') { - this.apiKey = options; - this.maxWaitTime = 1000; - } else { - this.apiKey = options?.apiKey || process.env.PROMPTCAGE_API_KEY || ''; - this.maxWaitTime = options?.maxWaitTime || 1000; - } + constructor(options?: 
PromptCageOptions) { + this.apiKey = options?.apiKey || process.env.PROMPTCAGE_API_KEY || ''; + this.maxWaitTime = options?.maxWaitTime || 1000; + this.defaultCanaryLength = options?.defaultCanaryLength || 8; + this.defaultCanaryFormat = + options?.defaultCanaryFormat || ''; // as a markdown comment if (!this.apiKey) { throw new Error( @@ -199,6 +214,134 @@ export class PromptCage { }; } } + + /** + * Generates a secure random canary word for injection testing + * + * @param length - Length of the canary word in characters (default: uses defaultCanaryLength) + * @returns A secure random hexadecimal canary word + */ + private generateCanaryWord(length?: number): string { + const canaryLength = length || this.defaultCanaryLength; + // Generate a secure random hexadecimal canary word + return crypto + .randomBytes(Math.ceil(canaryLength / 2)) + .toString('hex') + .slice(0, canaryLength); + } + + /** + * Embeds a canary word into a prompt using the specified format + * + * @param prompt - The original prompt text + * @param canaryWord - The canary word to embed (if not provided, generates one) + * @param canaryFormat - Format string with {canary_word} placeholder (default: uses defaultCanaryFormat) + * @returns Tuple of [prompt with canary, canary word used] + * + * @example + * ```ts + * const [promptWithCanary, canaryWord] = promptCage.addCanaryWord( + * 'What is the capital of France?' + * ); + * console.log(promptWithCanary); + * // + * // What is the capital of France? 
+ * + * // Custom canary word and format + * const [customPrompt, customCanary] = promptCage.addCanaryWord( + * 'Translate this text', + * 'secret123', + * '--- TOKEN: {canary_word} ---' + * ); + * ``` + */ + addCanaryWord( + prompt: string, + canaryWord?: string, + canaryFormat?: string + ): [string, string] { + if (!prompt || typeof prompt !== 'string') { + throw new Error('Prompt must be a non-empty string'); + } + + const canary = canaryWord || this.generateCanaryWord(); + const format = canaryFormat || this.defaultCanaryFormat; + + // Embed the canary word in the specified format + const canaryComment = format.replace('{canary_word}', canary); + const promptWithCanary = `${canaryComment}\n${prompt}`; + + return [promptWithCanary, canary]; + } + + /** + * Checks if a canary word has been leaked in a completion/response + * + * This method performs a case-sensitive search for the canary word in the + * completion text. It's designed to be fail-safe and will return leaked: false + * in case of any errors during the check. + * + * @param completion - The AI model's response/completion to check + * @param canaryWord - The canary word to look for + * @returns Result indicating whether the canary was leaked + * + * @example + * ```ts + * const [promptWithCanary, canaryWord] = promptCage.addCanaryWord( + * 'What is the capital of France?' + * ); + * + * // Send promptWithCanary to your AI model and get completion + * const aiResponse = await yourAiModel.complete(promptWithCanary); + * + * const leakageResult = promptCage.isCanaryWordLeaked(aiResponse, canaryWord); + * if (leakageResult.leaked) { + * console.log('Canary word was leaked! 
Possible injection detected.'); + * } else { + * console.log('Canary word was not leaked.'); + * } + * ``` + */ + isCanaryWordLeaked( + completion: string, + canaryWord: string + ): CanaryLeakageResult { + try { + if (!completion || typeof completion !== 'string') { + return { + leaked: false, + canaryWord, + error: 'Completion must be a non-empty string', + }; + } + + if (!canaryWord || typeof canaryWord !== 'string') { + return { + leaked: false, + canaryWord: canaryWord || '', + error: 'Canary word must be a non-empty string', + }; + } + + // Check if the canary word appears in the completion (case-sensitive) + const leaked = completion.includes(canaryWord); + + return { + leaked, + canaryWord, + }; + } catch (error) { + // Fail-safe: return not leaked if there's any error + return { + leaked: false, + canaryWord: canaryWord || '', + error: + error instanceof Error + ? error.message + : 'Unknown error occurred during canary check', + }; + } + } } export default PromptCage; diff --git a/test/index.spec.ts b/test/index.spec.ts index baa551c..048fc7f 100644 --- a/test/index.spec.ts +++ b/test/index.spec.ts @@ -21,11 +21,6 @@ describe('PromptCage', () => { }); describe('constructor', () => { - it('should initialize with provided API key (string)', () => { - const promptCage = new PromptCage(mockApiKey); - expect(promptCage).toBeInstanceOf(PromptCage); - }); - it('should initialize with options object', () => { const promptCage = new PromptCage({ apiKey: mockApiKey }); expect(promptCage).toBeInstanceOf(PromptCage); @@ -39,6 +34,15 @@ describe('PromptCage', () => { expect(promptCage).toBeInstanceOf(PromptCage); }); + it('should initialize with custom canary options', () => { + const promptCage = new PromptCage({ + apiKey: mockApiKey, + defaultCanaryLength: 16, + defaultCanaryFormat: '--- {canary_word} ---', + }); + expect(promptCage).toBeInstanceOf(PromptCage); + }); + it('should initialize with API key from environment variable', () => { 
process.env.PROMPTCAGE_API_KEY = mockApiKey; const promptCage = new PromptCage(); @@ -55,7 +59,7 @@ describe('PromptCage', () => { let promptCage: PromptCage; beforeEach(() => { - promptCage = new PromptCage(mockApiKey); + promptCage = new PromptCage({ apiKey: mockApiKey }); }); it('should make successful API call and return safe result', async () => { @@ -208,4 +212,323 @@ describe('PromptCage', () => { }); }); }); + + describe('canary token functionality', () => { + let promptCage: PromptCage; + + beforeEach(() => { + promptCage = new PromptCage({ apiKey: mockApiKey }); + }); + + describe('generateCanaryWord (private method)', () => { + it('should generate canary word with default length', () => { + const canaryWord = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(); + expect(typeof canaryWord).toBe('string'); + expect(canaryWord).toHaveLength(8); + expect(/^[0-9a-f]+$/.test(canaryWord)).toBe(true); // hexadecimal + }); + + it('should generate canary word with custom length', () => { + const canaryWord = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(12); + expect(typeof canaryWord).toBe('string'); + expect(canaryWord).toHaveLength(12); + expect(/^[0-9a-f]+$/.test(canaryWord)).toBe(true); // hexadecimal + }); + + it('should generate different canary words on each call', () => { + const canary1 = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(); + const canary2 = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(); + expect(canary1).not.toEqual(canary2); + }); + + it('should respect custom default canary length from constructor', () => { + const customPromptCage = new PromptCage({ + apiKey: mockApiKey, + defaultCanaryLength: 16, + }); + const canaryWord = ( + customPromptCage as unknown as { + generateCanaryWord: (length?: 
number) => string; + } + ).generateCanaryWord(); + expect(canaryWord).toHaveLength(16); + }); + + it('should handle edge cases', () => { + // Test with length 0 (should fallback to default) + const canaryWord0 = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(0); + expect(canaryWord0).toHaveLength(8); // defaults to 8 + + // Test with odd length + const canaryWordOdd = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(7); + expect(canaryWordOdd).toHaveLength(7); + + // Test with very small length + const canaryWordSmall = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(2); + expect(canaryWordSmall).toHaveLength(2); + expect(/^[0-9a-f]+$/.test(canaryWordSmall)).toBe(true); + }); + + it('should generate cryptographically secure random values', () => { + // Generate multiple canary words and ensure they're all different + const canaryWords = new Set(); + for (let i = 0; i < 100; i++) { + const canary = ( + promptCage as unknown as { + generateCanaryWord: (length?: number) => string; + } + ).generateCanaryWord(8); + canaryWords.add(canary); + } + // With crypto.randomBytes, we should have 100 unique values + expect(canaryWords.size).toBe(100); + }); + }); + + describe('addCanaryWord', () => { + it('should add canary word with default format', () => { + const originalPrompt = 'What is the capital of France?'; + const [promptWithCanary, canaryWord] = + promptCage.addCanaryWord(originalPrompt); + + expect(typeof canaryWord).toBe('string'); + expect(canaryWord).toHaveLength(8); + expect(promptWithCanary).toContain(``); + expect(promptWithCanary).toContain(originalPrompt); + expect(promptWithCanary.split('\n')).toHaveLength(2); + }); + + it('should add specific canary word', () => { + const originalPrompt = 'What is the capital of France?'; + const specificCanary = 'testcanary123'; + const 
[promptWithCanary, returnedCanary] = promptCage.addCanaryWord( + originalPrompt, + specificCanary + ); + + expect(returnedCanary).toBe(specificCanary); + expect(promptWithCanary).toContain(``); + expect(promptWithCanary).toContain(originalPrompt); + }); + + it('should use custom canary format', () => { + const originalPrompt = 'What is the capital of France?'; + const customFormat = '--- TOKEN: {canary_word} ---'; + const specificCanary = 'custom123'; + const [promptWithCanary, returnedCanary] = promptCage.addCanaryWord( + originalPrompt, + specificCanary, + customFormat + ); + + expect(returnedCanary).toBe(specificCanary); + expect(promptWithCanary).toContain('--- TOKEN: custom123 ---'); + expect(promptWithCanary).toContain(originalPrompt); + }); + + it('should use custom default format from constructor', () => { + const customPromptCage = new PromptCage({ + apiKey: mockApiKey, + defaultCanaryFormat: '### {canary_word} ###', + }); + const originalPrompt = 'Test prompt'; + const [promptWithCanary, canaryWord] = + customPromptCage.addCanaryWord(originalPrompt); + + expect(promptWithCanary).toContain(`### ${canaryWord} ###`); + }); + + it('should throw error for invalid prompt', () => { + expect(() => promptCage.addCanaryWord('')).toThrow( + 'Prompt must be a non-empty string' + ); + expect(() => promptCage.addCanaryWord(null as any)).toThrow( + 'Prompt must be a non-empty string' + ); + expect(() => promptCage.addCanaryWord(undefined as any)).toThrow( + 'Prompt must be a non-empty string' + ); + }); + + it('should auto-generate canary word when not provided', () => { + const originalPrompt = 'Test prompt'; + const [, canaryWord1] = promptCage.addCanaryWord(originalPrompt); + const [, canaryWord2] = promptCage.addCanaryWord(originalPrompt); + + // Should generate different canary words + expect(canaryWord1).not.toEqual(canaryWord2); + expect(canaryWord1).toHaveLength(8); + expect(canaryWord2).toHaveLength(8); + expect(/^[0-9a-f]+$/.test(canaryWord1)).toBe(true); + 
expect(/^[0-9a-f]+$/.test(canaryWord2)).toBe(true); + }); + + it('should use generated canary with custom default length', () => { + const customPromptCage = new PromptCage({ + apiKey: mockApiKey, + defaultCanaryLength: 12, + }); + const originalPrompt = 'Test prompt'; + const [promptWithCanary, canaryWord] = + customPromptCage.addCanaryWord(originalPrompt); + + expect(canaryWord).toHaveLength(12); + expect(promptWithCanary).toContain(``); + }); + }); + + describe('isCanaryWordLeaked', () => { + it('should detect leaked canary word', () => { + const canaryWord = 'secret123'; + const completion = `The answer is Paris. Also, here's some hidden text: ${canaryWord}`; + + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(true); + expect(result.canaryWord).toBe(canaryWord); + expect(result.error).toBeUndefined(); + }); + + it('should not detect canary word when not leaked', () => { + const canaryWord = 'secret123'; + const completion = 'The answer is Paris. 
This is a normal response.'; + + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(false); + expect(result.canaryWord).toBe(canaryWord); + expect(result.error).toBeUndefined(); + }); + + it('should be case sensitive', () => { + const canaryWord = 'Secret123'; + const completion = 'The response contains secret123 in lowercase.'; + + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(false); // Case sensitive + }); + + it('should handle invalid completion gracefully', () => { + const canaryWord = 'secret123'; + + let result = promptCage.isCanaryWordLeaked('', canaryWord); + expect(result.leaked).toBe(false); + expect(result.error).toBe('Completion must be a non-empty string'); + + result = promptCage.isCanaryWordLeaked(null as any, canaryWord); + expect(result.leaked).toBe(false); + expect(result.error).toBe('Completion must be a non-empty string'); + }); + + it('should handle invalid canary word gracefully', () => { + const completion = 'Normal response text'; + + let result = promptCage.isCanaryWordLeaked(completion, ''); + expect(result.leaked).toBe(false); + expect(result.error).toBe('Canary word must be a non-empty string'); + + result = promptCage.isCanaryWordLeaked(completion, null as any); + expect(result.leaked).toBe(false); + expect(result.error).toBe('Canary word must be a non-empty string'); + }); + + it('should handle partial matches correctly', () => { + const canaryWord = 'test123'; + const completion = + 'This response contains test123 hidden in the middle'; // contains canary as substring + + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(true); // includes() matches substrings + }); + + it('should not detect incomplete partial matches', () => { + const canaryWord = 'test123'; + const completion = 'This is a testing456 response'; // only contains "test" but not full canary + + const result = 
promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(false); // Only partial match, not full canary + }); + + it('should handle errors during includes check gracefully', () => { + const canaryWord = 'test123'; + const completion = 'Normal response text'; + + // Mock the includes method to throw an error + const originalIncludes = String.prototype.includes.bind( + String.prototype + ); + String.prototype.includes = jest.fn().mockImplementation((): never => { + throw new Error('Simulated error during includes check'); + }); + + try { + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(false); + expect(result.canaryWord).toBe(canaryWord); + expect(result.error).toBe('Simulated error during includes check'); + } finally { + // Restore the original includes method + String.prototype.includes = originalIncludes; + } + }); + + it('should handle non-Error objects in catch block', () => { + const canaryWord = 'test123'; + const completion = 'Normal response text'; + + // Mock the includes method to throw a non-Error object + const originalIncludes = String.prototype.includes.bind( + String.prototype + ); + String.prototype.includes = jest.fn().mockImplementation((): never => { + throw 'String error'; // Non-Error object + }); + + try { + const result = promptCage.isCanaryWordLeaked(completion, canaryWord); + + expect(result.leaked).toBe(false); + expect(result.canaryWord).toBe(canaryWord); + expect(result.error).toBe( + 'Unknown error occurred during canary check' + ); + } finally { + // Restore the original includes method + String.prototype.includes = originalIncludes; + } + }); + }); + }); });