From 08c64eb947bf0f9320d9b3a5381042572bbda8e8 Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 18 May 2026 10:38:42 -0700 Subject: [PATCH 1/2] fix(memory): prune toolSchemaCache and semaphores to prevent heap growth toolSchemaCache (lib/copilot/chat/payload.ts): module-level Map keyed by userId:workspaceId never deleted expired entries, only checked TTL on read. With 100K+ unique user/workspace pairs each holding 50-200KB of tool schemas, this was the primary driver of the 24MB -> 25GB heap growth observed in CloudWatch. Add a setInterval sweep every 30s (matching the TTL) with .unref() so it does not prevent graceful shutdown. semaphores (lib/core/async-jobs/backends/database.ts): acquireSlot created Semaphore entries that releaseSlot never deleted. With per-execution UUID keys (e.g. scheduleJobId), each scheduled workflow run would add a permanent entry. Store the concurrency limit on the Semaphore struct and delete the entry from the Map when all slots are free and no waiters remain. validatorCache (lib/copilot/tools/server/generated-schema.ts): validated as bounded (93 tools x 2 schema kinds = 186 max entries, ~2-9MB). No fix needed. isolated-vm nativeContexts: validated as deferred GC, self-healed by worker rotation at MAX_EXECUTIONS_PER_WORKER=200. externalMB spikes trace to concurrent isolate heaps at peak load (128MB limit x active isolates), not a reference leak. No fix needed. 
--- apps/sim/lib/copilot/chat/payload.ts | 7 +++++++ apps/sim/lib/core/async-jobs/backends/database.ts | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/copilot/chat/payload.ts b/apps/sim/lib/copilot/chat/payload.ts index 1c882f3d4d1..8190360e288 100644 --- a/apps/sim/lib/copilot/chat/payload.ts +++ b/apps/sim/lib/copilot/chat/payload.ts @@ -21,6 +21,13 @@ type ToolSchemaCacheEntry = { const toolSchemaCache = new Map() +setInterval(() => { + const now = Date.now() + for (const [key, entry] of toolSchemaCache) { + if (entry.expiresAt <= now) toolSchemaCache.delete(key) + } +}, TOOL_SCHEMA_CACHE_TTL_MS).unref() + interface BuildPayloadParams { message: string workflowId?: string diff --git a/apps/sim/lib/core/async-jobs/backends/database.ts b/apps/sim/lib/core/async-jobs/backends/database.ts index afaa44a3641..edbcd3d8b4a 100644 --- a/apps/sim/lib/core/async-jobs/backends/database.ts +++ b/apps/sim/lib/core/async-jobs/backends/database.ts @@ -38,6 +38,7 @@ function rowToJob(row: AsyncJobRow): Job { const inlineAbortControllers = new Map() interface Semaphore { + limit: number available: number waiters: Array<() => void> } @@ -46,7 +47,7 @@ const semaphores = new Map() async function acquireSlot(key: string, limit: number): Promise { let s = semaphores.get(key) if (!s) { - s = { available: limit, waiters: [] } + s = { limit, available: limit, waiters: [] } semaphores.set(key, s) } if (s.available > 0) { @@ -65,6 +66,9 @@ function releaseSlot(key: string): void { return } s.available += 1 + if (s.waiters.length === 0 && s.available === s.limit) { + semaphores.delete(key) + } } export class DatabaseJobQueue implements JobQueueBackend { From 6766bd2abbb8f03246613bd59120e3eaf3871051 Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 18 May 2026 10:53:23 -0700 Subject: [PATCH 2/2] fix(memory): prune effectiveEnvCache and instrument cache sizes in telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit effectiveEnvCache (lib/environment/utils.ts): same unbounded accumulation pattern as toolSchemaCache — module-level Map keyed by userId:workspaceId with a 15s TTL that is only checked on read, never proactively evicted. Adds a periodic sweep matching the TTL interval with .unref(). cache-registry (lib/monitoring/cache-registry.ts): lightweight registry so modules can expose their cache sizes to telemetry without coupling. toolSchemaCache and effectiveEnvCache both register on module load. memory-telemetry: emits cacheSizes in every Memory snapshot log so CloudWatch can confirm the caches stay bounded post-deploy. --- apps/sim/lib/copilot/chat/payload.ts | 3 +++ apps/sim/lib/environment/utils.ts | 10 ++++++++++ apps/sim/lib/monitoring/cache-registry.ts | 13 +++++++++++++ apps/sim/lib/monitoring/memory-telemetry.ts | 2 ++ 4 files changed, 28 insertions(+) create mode 100644 apps/sim/lib/monitoring/cache-registry.ts diff --git a/apps/sim/lib/copilot/chat/payload.ts b/apps/sim/lib/copilot/chat/payload.ts index 8190360e288..691d30ecb06 100644 --- a/apps/sim/lib/copilot/chat/payload.ts +++ b/apps/sim/lib/copilot/chat/payload.ts @@ -5,6 +5,7 @@ import { isPaid } from '@/lib/billing/plan-helpers' import { getToolEntry } from '@/lib/copilot/tool-executor/router' import { getCopilotToolDescription } from '@/lib/copilot/tools/descriptions' import { isHosted } from '@/lib/core/config/feature-flags' +import { registerCache } from '@/lib/monitoring/cache-registry' import { buildMothershipToolsForRequest } from '@/lib/mothership/settings/runtime' import { trackChatUpload } from '@/lib/uploads/contexts/workspace/workspace-file-manager' import { tools } from '@/tools/registry' @@ -28,6 +29,8 @@ setInterval(() => { } }, TOOL_SCHEMA_CACHE_TTL_MS).unref() +registerCache('toolSchemaCache', () => toolSchemaCache.size) + interface BuildPayloadParams { message: string workflowId?: string diff --git a/apps/sim/lib/environment/utils.ts b/apps/sim/lib/environment/utils.ts index 
509dcfd7667..e3e4c38753d 100644 --- a/apps/sim/lib/environment/utils.ts +++ b/apps/sim/lib/environment/utils.ts @@ -10,6 +10,7 @@ import { getAccessibleEnvCredentials, syncPersonalEnvCredentialsForUser, } from '@/lib/credentials/environment' +import { registerCache } from '@/lib/monitoring/cache-registry' import { checkWorkspaceAccess } from '@/lib/workspaces/permissions/utils' const logger = createLogger('EnvironmentUtils') @@ -23,6 +24,15 @@ type EffectiveEnvCacheEntry = { const effectiveEnvCache = new Map() +setInterval(() => { + const now = Date.now() + for (const [key, entry] of effectiveEnvCache) { + if (!entry.promise && entry.expiresAt <= now) effectiveEnvCache.delete(key) + } +}, EFFECTIVE_ENV_CACHE_TTL_MS).unref() + +registerCache('effectiveEnvCache', () => effectiveEnvCache.size) + function getEffectiveEnvCacheKey(userId: string, workspaceId?: string) { return `${userId}:${workspaceId ?? ''}` } diff --git a/apps/sim/lib/monitoring/cache-registry.ts b/apps/sim/lib/monitoring/cache-registry.ts new file mode 100644 index 00000000000..19b8753b444 --- /dev/null +++ b/apps/sim/lib/monitoring/cache-registry.ts @@ -0,0 +1,13 @@ +const registry = new Map<string, () => number>() + +export function registerCache(name: string, getSize: () => number): void { + registry.set(name, getSize) +} + +export function getCacheSizes(): Record<string, number> { + const sizes: Record<string, number> = {} + for (const [name, getSize] of registry) { + sizes[name] = getSize() + } + return sizes +} diff --git a/apps/sim/lib/monitoring/memory-telemetry.ts b/apps/sim/lib/monitoring/memory-telemetry.ts index 2845ee1def2..f80a0189dce 100644 --- a/apps/sim/lib/monitoring/memory-telemetry.ts +++ b/apps/sim/lib/monitoring/memory-telemetry.ts @@ -5,6 +5,7 @@ import v8 from 'node:v8' import { createLogger } from '@sim/logger' +import { getCacheSizes } from '@/lib/monitoring/cache-registry' const logger = createLogger('MemoryTelemetry', { logLevel: 'INFO' }) @@ -33,6 +34,7 @@ export function startMemoryTelemetry(intervalMs = 60_000) { ? 
process.getActiveResourcesInfo().length : -1, uptimeMin: Math.round(process.uptime() / 60), + cacheSizes: getCacheSizes(), }) }, intervalMs) timer.unref()