diff --git a/.changeset/green-colts-kiss.md b/.changeset/green-colts-kiss.md new file mode 100644 index 000000000..53ed5727e --- /dev/null +++ b/.changeset/green-colts-kiss.md @@ -0,0 +1,5 @@ +--- +'@tanstack/ai-groq': minor +--- + +Add tree-shakeable Text-to-Speech (TTS) adapter for Groq API with English and Arabic voices, multiple output formats (default WAV), configurable speed and sample rate, new types, model metadata, and unit tests. diff --git a/packages/typescript/ai-groq/src/adapters/tts.ts b/packages/typescript/ai-groq/src/adapters/tts.ts new file mode 100644 index 000000000..3ec5808f3 --- /dev/null +++ b/packages/typescript/ai-groq/src/adapters/tts.ts @@ -0,0 +1,159 @@ +import { BaseTTSAdapter } from '@tanstack/ai/adapters' +import { createGroqClient, generateId, getGroqApiKeyFromEnv } from '../utils' +import { validateAudioInput } from '../audio/audio-provider-options' +import type { GroqTTSModel } from '../model-meta' +import type { + GroqTTSFormat, + GroqTTSProviderOptions, + GroqTTSVoice, +} from '../audio/tts-provider-options' +import type { TTSOptions, TTSResult } from '@tanstack/ai' +import type Groq_SDK from 'groq-sdk' +import type { GroqClientConfig } from '../utils' + +/** + * Configuration for Groq TTS adapter + */ +export interface GroqTTSConfig extends GroqClientConfig {} + +/** + * Groq Text-to-Speech Adapter + * + * Tree-shakeable adapter for Groq TTS functionality. + * Supports canopylabs/orpheus-v1-english and canopylabs/orpheus-arabic-saudi models. + * + * Features: + * - English voices: autumn(f), diana(f), hannah(f), austin(m), daniel(m), troy(m) + * - Arabic voices: fahad(m), sultan(m), lulwa(f), noura(f) + * - Output formats: flac, mp3, mulaw, ogg, wav (only wav currently supported) + * - Speed control + * - Configurable sample rate + * - Vocal direction support (English voices only) + */ +export class GroqTTSAdapter extends BaseTTSAdapter< + TModel, + GroqTTSProviderOptions +> { + readonly name = 'groq' as const + + private client: Groq_SDK + + constructor(config: GroqTTSConfig, model: TModel) { + super(config, model) + this.client = createGroqClient(config) + } + + async generateSpeech( + options: TTSOptions, + ): Promise { + const { + model, + text, + voice = 'autumn', + format = 'wav', + speed, + modelOptions, + } = options + + validateAudioInput({ input: text, model }) + + const voiceFormat = format as GroqTTSFormat + + const request: Groq_SDK.Audio.Speech.SpeechCreateParams = { + model, + input: text, + voice: voice as GroqTTSVoice, + response_format: voiceFormat, + speed, + ...modelOptions, + } + + const response = await this.client.audio.speech.create(request) + + const arrayBuffer = await response.arrayBuffer() + const base64 = Buffer.from(arrayBuffer).toString('base64') + + const contentType = this.getContentType(voiceFormat) + + return { + id: generateId(this.name), + model, + audio: base64, + format: voiceFormat, + contentType, + } + } + + private getContentType(format: string): string { + const contentTypes: Record = { + flac: 'audio/flac', + mp3: 'audio/mpeg', + mulaw: 'audio/basic', + ogg: 'audio/ogg', + wav: 'audio/wav', + } + return contentTypes[format] || 'audio/wav' + } +} + +/** + * Creates a Groq speech adapter with explicit API key. + * Type resolution happens here at the call site. + * + * @param model - The model name (e.g., 'canopylabs/orpheus-v1-english') + * @param apiKey - Your Groq API key + * @param config - Optional additional configuration + * @returns Configured Groq speech adapter instance with resolved types + * + * @example + * ```typescript + * const adapter = createGroqSpeech('canopylabs/orpheus-v1-english', "gsk_..."); + * + * const result = await generateSpeech({ + * adapter, + * text: 'Hello, world!', + * voice: 'autumn' + * }); + * ``` + */ +export function createGroqSpeech( + model: TModel, + apiKey: string, + config?: Omit, +): GroqTTSAdapter { + return new GroqTTSAdapter({ apiKey, ...config }, model) +} + +/** + * Creates a Groq speech adapter with automatic API key detection from environment variables. + * Type resolution happens here at the call site. + * + * Looks for `GROQ_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @param model - The model name (e.g., 'canopylabs/orpheus-v1-english') + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured Groq speech adapter instance with resolved types + * @throws Error if GROQ_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses GROQ_API_KEY from environment + * const adapter = groqSpeech('canopylabs/orpheus-v1-english'); + * + * const result = await generateSpeech({ + * adapter, + * text: 'Welcome to TanStack AI!', + * voice: 'autumn', + * format: 'wav' + * }); + * ``` + */ +export function groqSpeech( + model: TModel, + config?: Omit, +): GroqTTSAdapter { + const apiKey = getGroqApiKeyFromEnv() + return createGroqSpeech(model, apiKey, config) +} diff --git a/packages/typescript/ai-groq/src/audio/audio-provider-options.ts b/packages/typescript/ai-groq/src/audio/audio-provider-options.ts new file mode 100644 index 000000000..d0a5d3ffe --- /dev/null +++ b/packages/typescript/ai-groq/src/audio/audio-provider-options.ts @@ -0,0 +1,25 @@ +/** + * Common audio provider options for Groq audio endpoints. + */ +export interface AudioProviderOptions { + /** + * The text to generate audio for. + * Maximum length is 200 characters. + * Use [directions] for vocal control (English voices only). + */ + input: string + /** + * The audio model to use for generation. + */ + model: string +} + +/** + * Validates that the audio input text does not exceed the maximum length. + * @throws Error if input text exceeds 200 characters + */ +export const validateAudioInput = (options: AudioProviderOptions) => { + if (options.input.length > 200) { + throw new Error('Input text exceeds maximum length of 200 characters.') + } +} diff --git a/packages/typescript/ai-groq/src/audio/tts-provider-options.ts b/packages/typescript/ai-groq/src/audio/tts-provider-options.ts new file mode 100644 index 000000000..8ab8f3c46 --- /dev/null +++ b/packages/typescript/ai-groq/src/audio/tts-provider-options.ts @@ -0,0 +1,49 @@ +/** + * Groq TTS voice options for English models + */ +export type GroqTTSEnglishVoice = + | 'autumn' + | 'diana' + | 'hannah' + | 'austin' + | 'daniel' + | 'troy' + +/** + * Groq TTS voice options for Arabic models + */ +export type GroqTTSArabicVoice = 'fahad' | 'sultan' | 'lulwa' | 'noura' + +/** + * Union of all Groq TTS voice options + */ +export type GroqTTSVoice = GroqTTSEnglishVoice | GroqTTSArabicVoice + +/** + * Groq TTS output format options. + * Only wav is currently supported. + */ +export type GroqTTSFormat = 'flac' | 'mp3' | 'mulaw' | 'ogg' | 'wav' + +/** + * Groq TTS sample rate options + */ +export type GroqTTSSampleRate = + | 8000 + | 16000 + | 22050 + | 24000 + | 32000 + | 44100 + | 48000 + +/** + * Provider-specific options for Groq TTS. + * These options are passed via `modelOptions` when calling `generateSpeech`. + */ +export interface GroqTTSProviderOptions { + /** + * The sample rate of the generated audio in Hz. + */ + sample_rate?: GroqTTSSampleRate +} diff --git a/packages/typescript/ai-groq/src/index.ts b/packages/typescript/ai-groq/src/index.ts index ff2d02872..40da99b1c 100644 --- a/packages/typescript/ai-groq/src/index.ts +++ b/packages/typescript/ai-groq/src/index.ts @@ -2,7 +2,7 @@ * @module @tanstack/ai-groq * * Groq provider adapter for TanStack AI. - * Provides tree-shakeable adapters for Groq's Chat Completions API. + * Provides tree-shakeable adapters for Groq's Chat Completions API and TTS API. */ // Text (Chat) adapter @@ -14,15 +14,33 @@ export { type GroqTextProviderOptions, } from './adapters/text' +// TTS adapter - for text-to-speech +export { + GroqTTSAdapter, + createGroqSpeech, + groqSpeech, + type GroqTTSConfig, +} from './adapters/tts' +export type { + GroqTTSProviderOptions, + GroqTTSVoice, + GroqTTSEnglishVoice, + GroqTTSArabicVoice, + GroqTTSFormat, + GroqTTSSampleRate, +} from './audio/tts-provider-options' + // Types export type { GroqChatModelProviderOptionsByName, + GroqTTSModelProviderOptionsByName, GroqModelInputModalitiesByName, ResolveProviderOptions, ResolveInputModalities, GroqChatModels, + GroqTTSModel, } from './model-meta' -export { GROQ_CHAT_MODELS } from './model-meta' +export { GROQ_CHAT_MODELS, GROQ_TTS_MODELS } from './model-meta' export type { GroqTextMetadata, GroqImageMetadata, diff --git a/packages/typescript/ai-groq/src/model-meta.ts b/packages/typescript/ai-groq/src/model-meta.ts index 83ce38800..65452fd29 100644 --- a/packages/typescript/ai-groq/src/model-meta.ts +++ b/packages/typescript/ai-groq/src/model-meta.ts @@ -1,4 +1,5 @@ import type { GroqTextProviderOptions } from './text/text-provider-options' +import type { GroqTTSProviderOptions } from './audio/tts-provider-options' /** * Internal metadata structure describing a Groq model's capabilities and pricing. @@ -351,14 +352,23 @@ export type GroqChatModelProviderOptionsByName = { [K in (typeof GROQ_CHAT_MODELS)[number]]: GroqTextProviderOptions } +/** + * Type-only map from Groq TTS model name to its provider options type. + */ +export type GroqTTSModelProviderOptionsByName = { + [K in GroqTTSModel]: GroqTTSProviderOptions +} + /** * Resolves the provider options type for a specific Groq model. - * Falls back to generic GroqTextProviderOptions for unknown models. + * Checks TTS models first, then chat models, then falls back to generic options. */ export type ResolveProviderOptions = - TModel extends keyof GroqChatModelProviderOptionsByName - ? GroqChatModelProviderOptionsByName[TModel] - : GroqTextProviderOptions + TModel extends GroqTTSModel + ? GroqTTSProviderOptions + : TModel extends keyof GroqChatModelProviderOptionsByName + ? GroqChatModelProviderOptionsByName[TModel] + : GroqTextProviderOptions /** * Resolve input modalities for a specific model. @@ -368,3 +378,50 @@ export type ResolveInputModalities = TModel extends keyof GroqModelInputModalitiesByName ? GroqModelInputModalitiesByName[TModel] : readonly ['text'] + +// ============================================================================ +// TTS Models +// ============================================================================ + +const ORPHEUS_V1_ENGLISH = { + name: 'canopylabs/orpheus-v1-english', + pricing: { + input: { + normal: 22, + }, + }, + supports: { + input: ['text'], + output: ['audio'], + endpoints: ['tts'], + features: [], + }, +} as const satisfies ModelMeta + +const ORPHEUS_ARABIC_SAUDI = { + name: 'canopylabs/orpheus-arabic-saudi', + pricing: { + input: { + normal: 40, + }, + }, + supports: { + input: ['text'], + output: ['audio'], + endpoints: ['tts'], + features: [], + }, +} as const satisfies ModelMeta + +/** + * All supported Groq TTS model identifiers. + */ +export const GROQ_TTS_MODELS = [ + ORPHEUS_V1_ENGLISH.name, + ORPHEUS_ARABIC_SAUDI.name, +] as const + +/** + * Union type of all supported Groq TTS model names. + */ +export type GroqTTSModel = (typeof GROQ_TTS_MODELS)[number] diff --git a/packages/typescript/ai-groq/src/utils/schema-converter.ts b/packages/typescript/ai-groq/src/utils/schema-converter.ts index d0a57cf44..28af19f1e 100644 --- a/packages/typescript/ai-groq/src/utils/schema-converter.ts +++ b/packages/typescript/ai-groq/src/utils/schema-converter.ts @@ -50,10 +50,7 @@ export function makeGroqStructuredOutputCompatible( ): Record { const result = { ...schema } - if (result.type === 'object') { - if (!result.properties) { - result.properties = {} - } + if (result.type === 'object' && result.properties) { const properties = { ...result.properties } const allPropertyNames = Object.keys(properties) @@ -96,6 +93,7 @@ export function makeGroqStructuredOutputCompatible( } else { delete result.required } + result.required = allPropertyNames result.additionalProperties = false } diff --git a/packages/typescript/ai-groq/tests/groq-tts.test.ts b/packages/typescript/ai-groq/tests/groq-tts.test.ts new file mode 100644 index 000000000..4195a4141 --- /dev/null +++ b/packages/typescript/ai-groq/tests/groq-tts.test.ts @@ -0,0 +1,264 @@ +import { + describe, + it, + expect, + vi, + afterEach, + beforeEach, + type Mock, +} from 'vitest' +import { createGroqSpeech, groqSpeech } from '../src/adapters/tts' +import type { TTSResult } from '@tanstack/ai' + +// Declare mockCreate at module level +let mockSpeechCreate: Mock<(...args: Array) => unknown> + +// Mock the Groq SDK +vi.mock('groq-sdk', () => { + return { + default: class { + audio = { + speech: { + create: (...args: Array) => mockSpeechCreate(...args), + }, + } + }, + } +}) + +// Helper to create a mock audio response +function createMockAudioResponse(audioContent = 'mock-audio-data') { + const encoder = new TextEncoder() + const buffer = encoder.encode(audioContent) + return { + arrayBuffer: () => Promise.resolve(buffer.buffer), + } +} + +describe('Groq TTS adapter', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + afterEach(() => { + vi.unstubAllEnvs() + }) + + describe('Adapter creation', () => { + it('creates a TTS adapter with explicit API key', () => { + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + expect(adapter).toBeDefined() + expect(adapter.kind).toBe('tts') + expect(adapter.name).toBe('groq') + expect(adapter.model).toBe('canopylabs/orpheus-v1-english') + }) + + it('creates a TTS adapter from environment variable', () => { + vi.stubEnv('GROQ_API_KEY', 'env-api-key') + + const adapter = groqSpeech('canopylabs/orpheus-arabic-saudi') + + expect(adapter).toBeDefined() + expect(adapter.kind).toBe('tts') + expect(adapter.model).toBe('canopylabs/orpheus-arabic-saudi') + }) + + it('throws if GROQ_API_KEY is not set when using groqSpeech', () => { + vi.stubEnv('GROQ_API_KEY', '') + + expect(() => groqSpeech('canopylabs/orpheus-v1-english')).toThrow( + 'GROQ_API_KEY is required', + ) + }) + + it('allows custom baseURL override', () => { + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + { + baseURL: 'https://custom.api.example.com/v1', + }, + ) + + expect(adapter).toBeDefined() + }) + }) + + describe('generateSpeech', () => { + it('generates speech and returns base64 audio', async () => { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse('test-audio-bytes')) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + const result: TTSResult = await adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: 'Hello, world!', + voice: 'autumn', + format: 'wav', + speed: 1, + }) + + expect(result).toBeDefined() + expect(result.model).toBe('canopylabs/orpheus-v1-english') + expect(result.format).toBe('wav') + expect(result.contentType).toBe('audio/wav') + expect(result.audio).toBeDefined() + expect(result.id).toMatch(/^groq-/) + }) + + it('passes correct parameters to the SDK', async () => { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse()) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + await adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: 'Test speech', + voice: 'daniel', + format: 'wav', + speed: 1.5, + modelOptions: { + sample_rate: 24000, + }, + }) + + expect(mockSpeechCreate).toHaveBeenCalledTimes(1) + const [params] = mockSpeechCreate.mock.calls[0] as [ + Record, + ] + + expect(params).toMatchObject({ + model: 'canopylabs/orpheus-v1-english', + input: 'Test speech', + voice: 'daniel', + response_format: 'wav', + speed: 1.5, + sample_rate: 24000, + }) + }) + + it('defaults to wav format when no format is specified', async () => { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse()) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + const result = await adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: 'Hello!', + }) + + expect(result.format).toBe('wav') + expect(result.contentType).toBe('audio/wav') + }) + + it('defaults to autumn voice when no voice is specified', async () => { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse()) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + await adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: 'Hello!', + }) + + const [params] = mockSpeechCreate.mock.calls[0] as [ + Record, + ] + expect(params.voice).toBe('autumn') + }) + + it('throws error when input exceeds 200 characters', async () => { + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + const longText = 'a'.repeat(201) + + await expect( + adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: longText, + }), + ).rejects.toThrow('Input text exceeds maximum length of 200 characters.') + }) + + it('returns correct content type for different formats', async () => { + const formatContentTypes: Array<[string, string]> = [ + ['mp3', 'audio/mpeg'], + ['flac', 'audio/flac'], + ['ogg', 'audio/ogg'], + ['wav', 'audio/wav'], + ] + + for (const [format, expectedContentType] of formatContentTypes) { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse()) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-v1-english', + 'test-api-key', + ) + + const result = await adapter.generateSpeech({ + model: 'canopylabs/orpheus-v1-english', + text: 'Test', + format: format as 'mp3' | 'flac' | 'wav', + }) + + expect(result.contentType).toBe(expectedContentType) + } + }) + + it('works with Arabic model and voices', async () => { + mockSpeechCreate = vi + .fn() + .mockResolvedValueOnce(createMockAudioResponse()) + + const adapter = createGroqSpeech( + 'canopylabs/orpheus-arabic-saudi', + 'test-api-key', + ) + + const result = await adapter.generateSpeech({ + model: 'canopylabs/orpheus-arabic-saudi', + text: 'مرحبا', + voice: 'fahad', + format: 'wav', + }) + + expect(result).toBeDefined() + expect(result.model).toBe('canopylabs/orpheus-arabic-saudi') + + const [params] = mockSpeechCreate.mock.calls[0] as [ + Record, + ] + expect(params.voice).toBe('fahad') + }) + }) +})