diff --git a/apps/channels/discord/README.md b/apps/channels/discord/README.md index 3e3fa120..c4061f2c 100644 --- a/apps/channels/discord/README.md +++ b/apps/channels/discord/README.md @@ -11,6 +11,7 @@ - sender identity extraction from Discord user IDs and display names - optional `allowed_channel_ids` allow-list - outbound delivery for `session_send` +- **media**: inbound file/image attachments are uploaded to the agent (images become vision input) and audio attachments are transcribed via STT; the agent's `attachment_send` deliveries are sent back as Discord file uploads. Attachments over the 25 MiB cap are skipped. ## Configuration diff --git a/apps/channels/discord/package.json b/apps/channels/discord/package.json index e6b207c2..038eedee 100644 --- a/apps/channels/discord/package.json +++ b/apps/channels/discord/package.json @@ -1,12 +1,13 @@ { "name": "@openhermit/channel-discord", "private": true, - "version": "0.2.0", + "version": "0.3.0", "type": "module", "main": "src/index.ts", "scripts": { "build": "tsc -b", "typecheck": "tsc -p tsconfig.typecheck.json --pretty false", + "test": "node --import tsx --test test/*.test.ts", "dev": "tsx src/index.ts" }, "dependencies": { diff --git a/apps/channels/discord/src/bot.ts b/apps/channels/discord/src/bot.ts index 59d98c90..09558f7a 100644 --- a/apps/channels/discord/src/bot.ts +++ b/apps/channels/discord/src/bot.ts @@ -1,6 +1,6 @@ import { ChannelType, Events, type Message } from 'discord.js'; -import type { DiscordApi, DiscordMessageEvent } from './discord-api.js'; +import type { DiscordApi, DiscordIncomingAttachment, DiscordMessageEvent } from './discord-api.js'; import type { DiscordBridge } from './bridge.js'; export interface BotOptions { @@ -32,18 +32,22 @@ export class DiscordBot { // Partials.Channel — handle them via the raw gateway dispatch instead. this.discord.client.on('raw' as any, (packet: any) => { if (packet.t !== 'MESSAGE_CREATE') return; - const { guild_id: guildId, author, content, channel_id: channelId, id: messageId } = packet.d ?? {}; - if (guildId || !author || author.bot || !content) return; + const { guild_id: guildId, author, content, channel_id: channelId, id: messageId, attachments } = packet.d ?? {}; + if (guildId || !author || author.bot) return; + const mapped = mapRawAttachments(attachments); + // Allow media-only DMs (no text) through when files are attached. + if (!content && mapped.length === 0) return; const event: DiscordMessageEvent = { channelId, userId: author.id, username: author.username, displayName: author.global_name ?? author.username, - text: content, + text: content ?? '', messageId, isDm: true, mentioned: true, + ...(mapped.length > 0 ? { attachments: mapped } : {}), }; void this.bridge.handleMessage(event).catch((err: Error) => { this.log(`error handling DM: ${err.message}`); @@ -68,11 +72,14 @@ export class DiscordBot { } } if (message.author.bot) return; - if (!message.content) return; // DMs are handled via the raw gateway dispatch above. if (message.channel.type === ChannelType.DM) return; + const mapped = mapMessageAttachments(message); + // Allow media-only messages (no text) through when files are attached. + if (!message.content && mapped.length === 0) return; + const mentioned = this.isMentioned(message); const text = this.stripMention(message.content); @@ -96,6 +103,7 @@ export class DiscordBot { isDm: false, mentioned, ...(message.guildId ? { guildId: message.guildId } : {}), + ...(mapped.length > 0 ? { attachments: mapped } : {}), }; try { @@ -111,6 +119,8 @@ export class DiscordBot { } } + // (helpers below the class) + private isMentioned(message: Message): boolean { const botId = this.discord.botUserId; if (!botId) return false; @@ -123,3 +133,33 @@ export class DiscordBot { return text.replace(new RegExp(`<@!?${botId}>\\s*`, 'g'), '').trim(); } } + +/** Map discord.js Message attachments to the channel-neutral shape. */ +export function mapMessageAttachments(message: Message): DiscordIncomingAttachment[] { + const out: DiscordIncomingAttachment[] = []; + for (const att of message.attachments.values()) { + out.push({ + url: att.url, + name: att.name ?? 'attachment', + ...(att.contentType ? { contentType: att.contentType } : {}), + ...(typeof att.size === 'number' ? { size: att.size } : {}), + }); + } + return out; +} + +/** Map raw gateway dispatch attachments (snake_case) to the neutral shape. */ +export function mapRawAttachments(attachments: unknown): DiscordIncomingAttachment[] { + if (!Array.isArray(attachments)) return []; + const out: DiscordIncomingAttachment[] = []; + for (const att of attachments) { + if (!att || typeof att.url !== 'string') continue; + out.push({ + url: att.url, + name: typeof att.filename === 'string' ? att.filename : 'attachment', + ...(typeof att.content_type === 'string' ? { contentType: att.content_type } : {}), + ...(typeof att.size === 'number' ? { size: att.size } : {}), + }); + } + return out; +} diff --git a/apps/channels/discord/src/bridge.ts b/apps/channels/discord/src/bridge.ts index aa8d2ae6..7ac8393b 100644 --- a/apps/channels/discord/src/bridge.ts +++ b/apps/channels/discord/src/bridge.ts @@ -7,11 +7,23 @@ import { stripSilenceTokens } from '@openhermit/shared'; import type { DiscordApi, DiscordMessageEvent } from './discord-api.js'; import { formatAgentResponse, markdownToDiscord } from './formatting.js'; +/** Gateway-enforced attachment cap (25 MiB). Skip oversized media early. */ +const MAX_MEDIA_BYTES = 25 * 1024 * 1024; + +/** Bound CDN attachment fetches so a stalled connection can't block the queue. */ +const MEDIA_FETCH_TIMEOUT_MS = 15_000; + interface TurnResult { text: string | undefined; error: string | undefined; } +/** Outcome of resolving an inbound message's attachments. */ +interface ResolvedInbound { + text: string; + attachments?: { type: 'file'; id: string }[]; +} + export class DiscordBridge implements ChannelOutbound { readonly channel = 'discord'; @@ -80,12 +92,67 @@ export class DiscordBridge implements ChannelOutbound { async handleMessage(event: DiscordMessageEvent): Promise { const text = event.text.trim(); - if (!text) return; + if (!text && !(event.attachments && event.attachments.length > 0)) return; const sessionId = await this.getSessionId(event.channelId); await this.runInChannelQueue(event.channelId, () => this.sendToAgent(event, sessionId, text)); } + /** + * Fetch each inbound attachment from the Discord CDN and either transcribe + * it (audio) or upload it as a durable session attachment (everything else). + */ + private async resolveInbound( + sessionId: string, + event: DiscordMessageEvent, + baseText: string, + ): Promise { + let text = baseText; + const ids: { type: 'file'; id: string }[] = []; + const transcripts: string[] = []; + + for (const att of event.attachments ?? []) { + if (att.size && att.size > MAX_MEDIA_BYTES) { + this.log(`skipping oversized attachment ${att.name} (${att.size} bytes)`); + continue; + } + let bytes: Uint8Array; + try { + // Bound the CDN fetch so a stalled connection can't block the queue. + const res = await fetch(att.url, { signal: AbortSignal.timeout(MEDIA_FETCH_TIMEOUT_MS) }); + if (!res.ok) throw new Error(`status ${res.status}`); + bytes = new Uint8Array(await res.arrayBuffer()); + } catch (err) { + this.log(`failed to fetch attachment ${att.name}: ${err instanceof Error ? err.message : String(err)}`); + continue; + } + const mime = att.contentType ?? 'application/octet-stream'; + if (mime.startsWith('audio/')) { + try { + const { text: transcript } = await this.client.transcribeAudio({ bytes, mimeType: mime }); + if (transcript.trim()) transcripts.push(transcript.trim()); + } catch (err) { + this.log(`stt failed for ${att.name}: ${err instanceof Error ? err.message : String(err)}`); + } + } else { + try { + const blob = new Blob([bytes as unknown as BlobPart], { type: mime }); + const uploaded = await this.client.uploadAttachment(sessionId, blob, att.name); + ids.push({ type: 'file', id: uploaded.id! }); + } catch (err) { + this.log(`upload failed for ${att.name}: ${err instanceof Error ? err.message : String(err)}`); + } + } + } + + if (transcripts.length > 0) { + const joined = transcripts.join('\n\n'); + text = text ? `${text}\n\n[Transcribed voice message]\n${joined}` : `[Transcribed voice message]\n${joined}`; + } + + return { text, ...(ids.length > 0 ? { attachments: ids } : {}) }; + } + private async runInChannelQueue(channelId: string, task: () => Promise): Promise { const previousTurn = this.turnQueues.get(channelId); const currentTurn = this.runAfterPreviousTurn(previousTurn, task).finally(() => { @@ -135,9 +202,14 @@ export class DiscordBridge implements ChannelOutbound { ): Promise { await this.ensureSession(sessionId, event); + const resolved = await this.resolveInbound(sessionId, event, text); + // Nothing usable (e.g. all attachments failed to fetch and no text). + if (!resolved.text && !resolved.attachments) return; + const postResult = await this.client.postMessage(sessionId, { - text, + text: resolved.text, mentioned: event.mentioned, + ...(resolved.attachments ? { attachments: resolved.attachments } : {}), sender: { channel: 'discord', channelUserId: event.userId, @@ -158,6 +230,35 @@ export class DiscordBridge implements ChannelOutbound { } } + /** + * Deliver an outbound `attachment` SSE event as a Discord file upload. + * Bytes are pulled lazily from the agent-local API. + */ + private async deliverAttachment( + channelId: string, + payload: Record, + ): Promise { + const sessionId = String(payload.sessionId ?? ''); + const attachmentId = String(payload.attachmentId ?? ''); + if (!sessionId || !attachmentId) { + this.log('attachment event missing sessionId/attachmentId'); + return; + } + const caption = + typeof payload.caption === 'string' && payload.caption.length > 0 + ? payload.caption + : undefined; + const hintedName = + typeof payload.name === 'string' && payload.name.length > 0 ? payload.name : undefined; + + const { bytes, filename } = await this.client.downloadAttachmentBytes(sessionId, attachmentId); + await this.discord.sendFile(channelId, { + bytes, + filename: hintedName ?? filename ?? 'attachment', + ...(caption ? { caption } : {}), + }); + } + private async ensureSession( sessionId: string, event: DiscordMessageEvent, @@ -283,6 +384,17 @@ export class DiscordBridge implements ChannelOutbound { continue; } + if (frame.event === 'attachment') { + try { + await this.deliverAttachment(channelId, payload); + } catch (err) { + this.log( + `attachment delivery failed: ${err instanceof Error ? err.message : String(err)}`, + ); + } + continue; + } + if (frame.event === 'agent_end') { sawAgentEnd = true; continue; diff --git a/apps/channels/discord/src/discord-api.ts b/apps/channels/discord/src/discord-api.ts index 850c4cb1..06f935b9 100644 --- a/apps/channels/discord/src/discord-api.ts +++ b/apps/channels/discord/src/discord-api.ts @@ -1,10 +1,19 @@ import { + AttachmentBuilder, Client, GatewayIntentBits, Partials, type Message, } from 'discord.js'; +/** An inbound file attached to a Discord message (CDN-hosted). */ +export interface DiscordIncomingAttachment { + url: string; + name: string; + contentType?: string; + size?: number; +} + export interface DiscordMessageEvent { channelId: string; userId: string; @@ -15,6 +24,7 @@ export interface DiscordMessageEvent { isDm: boolean; mentioned: boolean; guildId?: string; + attachments?: DiscordIncomingAttachment[]; } export class DiscordApi { @@ -62,6 +72,22 @@ export class DiscordApi { return (channel as any).send(text) as Promise; } + /** Send a file attachment, optionally with caption text in the same message. */ + async sendFile( + channelId: string, + file: { bytes: Uint8Array; filename: string; caption?: string }, + ): Promise { + const channel = await this.client.channels.fetch(channelId); + if (!channel || !('send' in channel)) { + throw new Error(`Channel ${channelId} not found or not text-based`); + } + const attachment = new AttachmentBuilder(Buffer.from(file.bytes), { name: file.filename }); + const payload: { files: AttachmentBuilder[]; content?: string } = { files: [attachment] }; + if (file.caption && file.caption.length > 0) payload.content = file.caption; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return (channel as any).send(payload) as Promise; + } + async editMessage(channelId: string, messageId: string, text: string): Promise { const channel = await this.client.channels.fetch(channelId); if (!channel || !('messages' in channel)) return; diff --git a/apps/channels/discord/test/bot.test.ts b/apps/channels/discord/test/bot.test.ts new file mode 100644 index 00000000..eefc9227 --- /dev/null +++ b/apps/channels/discord/test/bot.test.ts @@ -0,0 +1,25 @@ +import assert from 'node:assert/strict'; +import { test } from 'node:test'; + +import { mapRawAttachments } from '../src/bot.js'; + +test('mapRawAttachments maps gateway-dispatch attachments to the neutral shape', () => { + const mapped = mapRawAttachments([ + { url: 'https://cdn.discordapp.com/a.png', filename: 'a.png', content_type: 'image/png', size: 1234 }, + { url: 'https://cdn.discordapp.com/b.pdf', filename: 'b.pdf', content_type: 'application/pdf', size: 99 }, + ]); + assert.deepEqual(mapped, [ + { url: 'https://cdn.discordapp.com/a.png', name: 'a.png', contentType: 'image/png', size: 1234 }, + { url: 'https://cdn.discordapp.com/b.pdf', name: 'b.pdf', contentType: 'application/pdf', size: 99 }, + ]); +}); + +test('mapRawAttachments tolerates missing fields and non-arrays', () => { + assert.deepEqual(mapRawAttachments(undefined), []); + assert.deepEqual(mapRawAttachments('nope'), []); + assert.deepEqual(mapRawAttachments([{ url: 'https://x/y' }]), [ + { url: 'https://x/y', name: 'attachment' }, + ]); + // Entries without a url are dropped. + assert.deepEqual(mapRawAttachments([{ filename: 'no-url.txt' }]), []); +}); diff --git a/apps/gateway/package.json b/apps/gateway/package.json index 7ed9db1d..b8d227d4 100644 --- a/apps/gateway/package.json +++ b/apps/gateway/package.json @@ -39,6 +39,6 @@ "optionalDependencies": { "@openhermit/channel-telegram": "0.2.0", "@openhermit/channel-slack": "0.2.0", - "@openhermit/channel-discord": "0.2.0" + "@openhermit/channel-discord": "0.3.0" } } diff --git a/docs/channel-adapter.md b/docs/channel-adapter.md index 6f81a910..4ebf3365 100644 --- a/docs/channel-adapter.md +++ b/docs/channel-adapter.md @@ -11,7 +11,7 @@ These ship inside the CLI binary and are registered automatically: | Platform | Package | Connection | |----------|---------|------------| | Telegram | `@openhermit/channel-telegram` | polling or webhook | -| Discord | `@openhermit/channel-discord` | Discord gateway via `discord.js` | +| Discord | `@openhermit/channel-discord` | Discord gateway via `discord.js`; text + media (files/images, audio transcribed) | | Slack | `@openhermit/channel-slack` | Slack Socket Mode | ## External Plugin Adapters @@ -166,6 +166,7 @@ Discord: - guild messages and DMs - mention detection before routing - optional `allowed_channel_ids` +- media inbound (CDN attachments uploaded as session attachments; images become vision input; audio transcribed via STT) and outbound (`attachment_send` → Discord file upload); media over the 25 MiB cap is skipped Slack: diff --git a/docs/manual/17-channels.md b/docs/manual/17-channels.md index 130b343b..2a339979 100644 --- a/docs/manual/17-channels.md +++ b/docs/manual/17-channels.md @@ -61,6 +61,7 @@ A gateway restart (`hermit gateway stop && hermit gateway start`) is required fo - Bot applications are per agent. - Slash commands optional; the agent works in plain channel chat once invited. +- Media: file/image attachments are uploaded to the agent (images become vision input) and audio attachments are transcribed; the agent can send files back. Attachments over 25 MiB are skipped. ### Slack diff --git a/package-lock.json b/package-lock.json index 4e37a0dd..68678d30 100644 --- a/package-lock.json +++ b/package-lock.json @@ -53,7 +53,7 @@ }, "apps/channels/discord": { "name": "@openhermit/channel-discord", - "version": "0.2.0", + "version": "0.3.0", "dependencies": { "@openhermit/sdk": "^0.6.0", "@openhermit/shared": "0.2.0", @@ -324,7 +324,7 @@ "vite": "^6.3.2" }, "optionalDependencies": { - "@openhermit/channel-discord": "0.2.0", + "@openhermit/channel-discord": "0.3.0", "@openhermit/channel-slack": "0.2.0", "@openhermit/channel-telegram": "0.2.0" }