Skip to content

Commit 5ad269b

Browse files
committed
fix(execution): fix isolated-vm memory leak and add worker recycling
1 parent 1acafe8 commit 5ad269b

File tree

3 files changed

+80
-13
lines changed

3 files changed

+80
-13
lines changed

apps/sim/lib/core/config/env.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ export const env = createEnv({
235235
IVM_DISTRIBUTED_MAX_INFLIGHT_PER_OWNER:z.string().optional().default('2200'), // Max owner in-flight leases across replicas
236236
IVM_DISTRIBUTED_LEASE_MIN_TTL_MS: z.string().optional().default('120000'), // Min TTL for distributed in-flight leases (ms)
237237
IVM_QUEUE_TIMEOUT_MS: z.string().optional().default('300000'), // Max queue wait before rejection (ms)
238+
IVM_MAX_EXECUTIONS_PER_WORKER: z.string().optional().default('500'), // Max lifetime executions before worker is recycled
238239

239240
// Knowledge Base Processing Configuration - Shared across all processing methods
240241
KB_CONFIG_MAX_DURATION: z.number().optional().default(600), // Max processing duration in seconds (10 minutes)

apps/sim/lib/execution/isolated-vm-worker.cjs

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -142,39 +142,58 @@ async function executeCode(request) {
142142
stdoutTruncated = true
143143
}
144144

145+
// Hoist all ivm handle declarations so finally can release them deterministically.
146+
// Per isolated-vm upstream issues #198 and #377: child handles (scripts, callbacks,
147+
// references, external copies) must be released before isolate.dispose() to avoid
148+
// stuck-GC states and native memory leaks outside the V8 heap.
149+
let context = null
150+
let bootstrapScript = null
151+
let userScript = null
152+
let logCallback = null
153+
let errorCallback = null
154+
let fetchCallback = null
155+
const externalCopies = []
156+
145157
try {
146158
isolate = new ivm.Isolate({ memoryLimit: 128 })
147-
const context = await isolate.createContext()
159+
context = await isolate.createContext()
148160
const jail = context.global
149161

150162
await jail.set('global', jail.derefInto())
151163

152-
const logCallback = new ivm.Callback((...args) => {
164+
logCallback = new ivm.Callback((...args) => {
153165
const message = args.map((arg) => stringifyLogValue(arg)).join(' ')
154166
appendStdout(`${message}\n`)
155167
})
156168
await jail.set('__log', logCallback)
157169

158-
const errorCallback = new ivm.Callback((...args) => {
170+
errorCallback = new ivm.Callback((...args) => {
159171
const message = args.map((arg) => stringifyLogValue(arg)).join(' ')
160172
appendStdout(`ERROR: ${message}\n`)
161173
})
162174
await jail.set('__error', errorCallback)
163175

164-
await jail.set('params', new ivm.ExternalCopy(params).copyInto())
165-
await jail.set('environmentVariables', new ivm.ExternalCopy(envVars).copyInto())
176+
const paramsCopy = new ivm.ExternalCopy(params)
177+
externalCopies.push(paramsCopy)
178+
await jail.set('params', paramsCopy.copyInto())
179+
180+
const envVarsCopy = new ivm.ExternalCopy(envVars)
181+
externalCopies.push(envVarsCopy)
182+
await jail.set('environmentVariables', envVarsCopy.copyInto())
166183

167184
for (const [key, value] of Object.entries(contextVariables)) {
168185
if (value === undefined) {
169186
await jail.set(key, undefined)
170187
} else if (value === null) {
171188
await jail.set(key, null)
172189
} else {
173-
await jail.set(key, new ivm.ExternalCopy(value).copyInto())
190+
const ctxCopy = new ivm.ExternalCopy(value)
191+
externalCopies.push(ctxCopy)
192+
await jail.set(key, ctxCopy.copyInto())
174193
}
175194
}
176195

177-
const fetchCallback = new ivm.Reference(async (url, optionsJson) => {
196+
fetchCallback = new ivm.Reference(async (url, optionsJson) => {
178197
return new Promise((resolve) => {
179198
const fetchId = ++fetchIdCounter
180199
const timeout = setTimeout(() => {
@@ -267,7 +286,7 @@ async function executeCode(request) {
267286
}
268287
`
269288

270-
const bootstrapScript = await isolate.compileScript(bootstrap)
289+
bootstrapScript = await isolate.compileScript(bootstrap)
271290
await bootstrapScript.run(context)
272291

273292
const wrappedCode = `
@@ -290,7 +309,7 @@ async function executeCode(request) {
290309
})()
291310
`
292311

293-
const userScript = await isolate.compileScript(wrappedCode, { filename: 'user-function.js' })
312+
userScript = await isolate.compileScript(wrappedCode, { filename: 'user-function.js' })
294313
const resultJson = await userScript.run(context, { timeout: timeoutMs, promise: true })
295314

296315
let result = null
@@ -357,8 +376,30 @@ async function executeCode(request) {
357376
},
358377
}
359378
} finally {
379+
// Release child handles first (scripts, callbacks, references, external copies),
380+
// then dispose the isolate. Order matters: disposing the isolate while child
381+
// handles still exist can cause stuck-GC states (isolated-vm issue #198).
382+
// .release() is idempotent — safe to call even if the object was never assigned.
383+
const releaseables = [
384+
userScript,
385+
bootstrapScript,
386+
...externalCopies,
387+
fetchCallback,
388+
errorCallback,
389+
logCallback,
390+
context,
391+
]
392+
for (const obj of releaseables) {
393+
if (obj) {
394+
try {
395+
obj.release()
396+
} catch {}
397+
}
398+
}
360399
if (isolate) {
361-
isolate.dispose()
400+
try {
401+
isolate.dispose()
402+
} catch {}
362403
}
363404
}
364405
}

apps/sim/lib/execution/isolated-vm.ts

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ const DISTRIBUTED_MAX_INFLIGHT_PER_OWNER =
7070
Number.parseInt(env.IVM_DISTRIBUTED_MAX_INFLIGHT_PER_OWNER) ||
7171
MAX_ACTIVE_PER_OWNER + MAX_QUEUED_PER_OWNER
7272
const DISTRIBUTED_LEASE_MIN_TTL_MS = Number.parseInt(env.IVM_DISTRIBUTED_LEASE_MIN_TTL_MS) || 120000
73+
const MAX_EXECUTIONS_PER_WORKER = Number.parseInt(env.IVM_MAX_EXECUTIONS_PER_WORKER) || 500
7374
const DISTRIBUTED_KEY_PREFIX = 'ivm:fair:v1:owner'
7475
const LEASE_REDIS_DEADLINE_MS = 200
7576
const QUEUE_RETRY_DELAY_MS = 1000
@@ -89,6 +90,8 @@ interface WorkerInfo {
8990
pendingExecutions: Map<number, PendingExecution>
9091
idleTimeout: ReturnType<typeof setTimeout> | null
9192
id: number
93+
lifetimeExecutions: number
94+
retiring: boolean
9295
}
9396

9497
interface QueuedExecution {
@@ -538,8 +541,20 @@ function handleWorkerMessage(workerId: number, message: unknown) {
538541
owner.activeExecutions = Math.max(0, owner.activeExecutions - 1)
539542
maybeCleanupOwner(owner.ownerKey)
540543
}
544+
workerInfo!.lifetimeExecutions++
545+
if (workerInfo!.lifetimeExecutions >= MAX_EXECUTIONS_PER_WORKER && !workerInfo!.retiring) {
546+
workerInfo!.retiring = true
547+
logger.info('Worker marked for retirement', {
548+
workerId,
549+
lifetimeExecutions: workerInfo!.lifetimeExecutions,
550+
})
551+
}
552+
if (workerInfo!.retiring && workerInfo!.activeExecutions === 0) {
553+
cleanupWorker(workerId)
554+
} else {
555+
resetWorkerIdleTimeout(workerId)
556+
}
541557
pending.resolve(msg.result as IsolatedVMExecutionResult)
542-
resetWorkerIdleTimeout(workerId)
543558
drainQueue()
544559
}
545560
return
@@ -679,6 +694,8 @@ function spawnWorker(): Promise<WorkerInfo> {
679694
pendingExecutions: new Map(),
680695
idleTimeout: null,
681696
id: workerId,
697+
lifetimeExecutions: 0,
698+
retiring: false,
682699
}
683700

684701
workerInfo.readyPromise = new Promise<void>((resolve, reject) => {
@@ -710,7 +727,10 @@ function spawnWorker(): Promise<WorkerInfo> {
710727

711728
import('node:child_process')
712729
.then(({ spawn }) => {
713-
const proc = spawn('node', [workerPath], {
730+
// isolated-vm v6 requires --no-node-snapshot on Node.js 20+.
731+
// Without it, Node's shared V8 snapshot heap is incompatible with isolated-vm
732+
// and causes SIGSEGV on worker startup (isolated-vm issue #377).
733+
const proc = spawn('node', ['--no-node-snapshot', workerPath], {
714734
stdio: ['ignore', 'pipe', 'pipe', 'ipc'],
715735
serialization: 'json',
716736
})
@@ -801,6 +821,7 @@ function selectWorker(): WorkerInfo | null {
801821
let best: WorkerInfo | null = null
802822
for (const w of workers.values()) {
803823
if (!w.ready) continue
824+
if (w.retiring) continue
804825
if (w.activeExecutions >= MAX_PER_WORKER) continue
805826
if (!best || w.activeExecutions < best.activeExecutions) {
806827
best = w
@@ -855,7 +876,11 @@ function dispatchToWorker(
855876
stdout: '',
856877
error: { message: `Execution timed out after ${req.timeoutMs}ms`, name: 'TimeoutError' },
857878
})
858-
resetWorkerIdleTimeout(workerInfo.id)
879+
if (workerInfo.retiring && workerInfo.activeExecutions === 0) {
880+
cleanupWorker(workerInfo.id)
881+
} else {
882+
resetWorkerIdleTimeout(workerInfo.id)
883+
}
859884
drainQueue()
860885
}, req.timeoutMs + 1000)
861886

0 commit comments

Comments
 (0)