diff --git a/apps/sim/app/api/resume/[workflowId]/[executionId]/[contextId]/route.ts b/apps/sim/app/api/resume/[workflowId]/[executionId]/[contextId]/route.ts index 7f9ed06a1a4..a9bab734a0f 100644 --- a/apps/sim/app/api/resume/[workflowId]/[executionId]/[contextId]/route.ts +++ b/apps/sim/app/api/resume/[workflowId]/[executionId]/[contextId]/route.ts @@ -10,7 +10,6 @@ import { generateRequestId } from '@/lib/core/utils/request' import { SSE_HEADERS } from '@/lib/core/utils/sse' import { getBaseUrl } from '@/lib/core/utils/urls' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' -import { setExecutionMeta } from '@/lib/execution/event-buffer' import { preprocessExecution } from '@/lib/execution/preprocessing' import { PauseResumeManager } from '@/lib/workflows/executor/human-in-the-loop-manager' import { createStreamingResponse } from '@/lib/workflows/streaming/streaming' @@ -157,12 +156,6 @@ export const POST = withRouteHandler( }) } - await setExecutionMeta(enqueueResult.resumeExecutionId, { - status: 'active', - userId, - workflowId, - }) - const resumeArgs = { resumeEntryId: enqueueResult.resumeEntryId, resumeExecutionId: enqueueResult.resumeExecutionId, @@ -249,6 +242,14 @@ export const POST = withRouteHandler( error: toError(dispatchError).message, resumeExecutionId: enqueueResult.resumeExecutionId, }) + await PauseResumeManager.markResumeAttemptFailed({ + resumeEntryId: enqueueResult.resumeEntryId, + pausedExecutionId: enqueueResult.pausedExecution.id, + parentExecutionId: executionId, + contextId: enqueueResult.contextId, + failureReason: 'Failed to queue async resume execution', + }) + await PauseResumeManager.processQueuedResumes(executionId) return NextResponse.json( { error: 'Failed to queue resume execution. Please try again.' 
}, { status: 503 } diff --git a/apps/sim/app/api/workflows/[id]/execute/route.ts b/apps/sim/app/api/workflows/[id]/execute/route.ts index 24e9038dd55..b0f0a0b1d4d 100644 --- a/apps/sim/app/api/workflows/[id]/execute/route.ts +++ b/apps/sim/app/api/workflows/[id]/execute/route.ts @@ -25,7 +25,12 @@ import { SIM_VIA_HEADER, validateCallChain, } from '@/lib/execution/call-chain' -import { createExecutionEventWriter, setExecutionMeta } from '@/lib/execution/event-buffer' +import { + createExecutionEventWriter, + flushExecutionStreamReplayBuffer, + initializeExecutionStreamMeta, + type TerminalExecutionStreamStatus, +} from '@/lib/execution/event-buffer' import { processInputFileFields } from '@/lib/execution/files' import { registerManualExecutionAborter, @@ -868,11 +873,17 @@ async function handleExecutePost( let isManualAbortRegistered = false const eventWriter = createExecutionEventWriter(executionId) - setExecutionMeta(executionId, { - status: 'active', + const metaInitialized = await initializeExecutionStreamMeta(executionId, { userId: actorUserId, workflowId, - }).catch(() => {}) + }) + if (!metaInitialized) { + timeoutController.cleanup() + return NextResponse.json( + { error: 'Run buffer temporarily unavailable' }, + { status: 503, headers: { 'X-Execution-Id': executionId } } + ) + } const stream = new ReadableStream({ async start(controller) { @@ -881,12 +892,18 @@ async function handleExecutePost( registerManualExecutionAborter(executionId, timeoutController.abort) isManualAbortRegistered = true - let localEventSeq = 0 - const sendEvent = (event: ExecutionEvent) => { + let terminalEventPublished = false + const sendEvent = async ( + event: ExecutionEvent, + terminalStatus?: TerminalExecutionStreamStatus + ) => { const isBuffered = event.type !== 'stream:chunk' && event.type !== 'stream:done' if (isBuffered) { - localEventSeq++ - event.eventId = localEventSeq + const entry = terminalStatus + ? 
await eventWriter.writeTerminal(event, terminalStatus) + : await eventWriter.write(event) + event.eventId = entry.eventId + terminalEventPublished ||= Boolean(terminalStatus) } if (!isStreamClosed) { try { @@ -895,15 +912,12 @@ async function handleExecutePost( isStreamClosed = true } } - if (isBuffered) { - eventWriter.write(event).catch(() => {}) - } } try { const startTime = new Date() - sendEvent({ + await sendEvent({ type: 'execution:started', timestamp: startTime.toISOString(), executionId, @@ -922,7 +936,7 @@ async function handleExecutePost( childWorkflowContext?: ChildWorkflowContext ) => { reqLogger.info('onBlockStart called', { blockId, blockName, blockType }) - sendEvent({ + await sendEvent({ type: 'block:started', timestamp: new Date().toISOString(), executionId, @@ -976,7 +990,7 @@ async function handleExecutePost( blockType, error: callbackData.output.error, }) - sendEvent({ + await sendEvent({ type: 'block:error', timestamp: new Date().toISOString(), executionId, @@ -1010,7 +1024,7 @@ async function handleExecutePost( blockName, blockType, }) - sendEvent({ + await sendEvent({ type: 'block:completed', timestamp: new Date().toISOString(), executionId, @@ -1053,7 +1067,7 @@ async function handleExecutePost( if (done) break const chunk = decoder.decode(value, { stream: true }) - sendEvent({ + await sendEvent({ type: 'stream:chunk', timestamp: new Date().toISOString(), executionId, @@ -1062,7 +1076,7 @@ async function handleExecutePost( }) } - sendEvent({ + await sendEvent({ type: 'stream:done', timestamp: new Date().toISOString(), executionId, @@ -1107,13 +1121,14 @@ async function handleExecutePost( selectedOutputs ) - const onChildWorkflowInstanceReady = ( + const onChildWorkflowInstanceReady = async ( blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, - executionOrder?: number + executionOrder?: number, + childWorkflowContext?: ChildWorkflowContext ) => { - sendEvent({ + await sendEvent({ type: 
'block:childWorkflowStarted', timestamp: new Date().toISOString(), executionId, @@ -1123,7 +1138,16 @@ async function handleExecutePost( childWorkflowInstanceId, ...(iterationContext && { iterationCurrent: iterationContext.iterationCurrent, + iterationTotal: iterationContext.iterationTotal, + iterationType: iterationContext.iterationType, iterationContainerId: iterationContext.iterationContainerId, + ...(iterationContext.parentIterations?.length && { + parentIterations: iterationContext.parentIterations, + }), + }), + ...(childWorkflowContext && { + childWorkflowBlockId: childWorkflowContext.parentBlockId, + childWorkflowName: childWorkflowContext.workflowName, }), ...(executionOrder !== undefined && { executionOrder }), }, @@ -1157,32 +1181,38 @@ async function handleExecutePost( await loggingSession.markAsFailed(timeoutErrorMessage) - sendEvent({ - type: 'execution:error', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { - error: timeoutErrorMessage, - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, - }, - }) finalMetaStatus = 'error' + await sendEvent( + { + type: 'execution:error', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + error: timeoutErrorMessage, + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, + }, + 'error' + ) } else { reqLogger.info('Workflow execution was cancelled') - sendEvent({ - type: 'execution:cancelled', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, - }, - }) finalMetaStatus = 'cancelled' + await sendEvent( + { + type: 'execution:cancelled', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, + }, + 'cancelled' + ) } return } @@ -1196,35 +1226,43 @@ async function handleExecutePost( : result.output if (result.status 
=== 'paused') { - sendEvent({ - type: 'execution:paused', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { - output: sseOutput, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || startTime.toISOString(), - endTime: result.metadata?.endTime || new Date().toISOString(), + finalMetaStatus = 'complete' + await sendEvent( + { + type: 'execution:paused', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + output: sseOutput, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || startTime.toISOString(), + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, }, - }) + 'complete' + ) } else { - sendEvent({ - type: 'execution:completed', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { - success: result.success, - output: sseOutput, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || startTime.toISOString(), - endTime: result.metadata?.endTime || new Date().toISOString(), - finalBlockLogs: result.logs, + finalMetaStatus = 'complete' + await sendEvent( + { + type: 'execution:completed', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + success: result.success, + output: sseOutput, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || startTime.toISOString(), + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, }, - }) + 'complete' + ) } - finalMetaStatus = 'complete' } catch (error: unknown) { const isTimeout = isTimeoutError(error) || timeoutController.isTimedOut() const errorMessage = isTimeout @@ -1237,32 +1275,55 @@ async function handleExecutePost( const executionResult = hasExecutionResult(error) ? 
error.executionResult : undefined - sendEvent({ - type: 'execution:error', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { - error: executionResult?.error || errorMessage, - duration: executionResult?.metadata?.duration || 0, - finalBlockLogs: executionResult?.logs, - }, - }) finalMetaStatus = 'error' + await sendEvent( + { + type: 'execution:error', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + error: executionResult?.error || errorMessage, + duration: executionResult?.metadata?.duration || 0, + finalBlockLogs: executionResult?.logs, + }, + }, + 'error' + ) } finally { if (isManualAbortRegistered) { unregisterManualExecutionAborter(executionId) isManualAbortRegistered = false } - try { - await eventWriter.close() - } catch (closeError) { - reqLogger.warn('Failed to close event writer', { - error: toError(closeError).message, + if (finalMetaStatus && !terminalEventPublished) { + const replayBufferFlushed = await flushExecutionStreamReplayBuffer( + executionId, + eventWriter + ) + reqLogger.error('Failed to publish terminal execution event durably', { + executionId, + status: finalMetaStatus, + replayBufferFlushed, }) - } - if (finalMetaStatus) { - setExecutionMeta(executionId, { status: finalMetaStatus }).catch(() => {}) + if (!isStreamClosed) { + controller.error(new Error('Run buffer terminal event publish failed')) + isStreamClosed = true + } + } else if (terminalEventPublished) { + await eventWriter.close().catch((closeError) => { + reqLogger.warn('Failed to close execution event writer after terminal publish', { + executionId, + error: closeError instanceof Error ? 
closeError.message : String(closeError), + }) + }) + } else { + try { + await eventWriter.close() + } catch (closeError) { + reqLogger.warn('Failed to close event writer', { + error: toError(closeError).message, + }) + } } timeoutController.cleanup() if (executionId) { diff --git a/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.test.ts b/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.test.ts index c7b86847f0b..6ee6c71aa7d 100644 --- a/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.test.ts +++ b/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.test.ts @@ -15,17 +15,27 @@ import { beforeEach, describe, expect, it, vi } from 'vitest' const { mockMarkExecutionCancelled, mockAbortManualExecution, - mockCancelPausedExecution, - mockSetExecutionMeta, + mockBeginPausedCancellation, + mockBlockQueuedResumesForCancellation, + mockClearPausedCancellationIntent, + mockCompletePausedCancellation, + mockGetPausedCancellationStatus, + mockFinalizeExecutionStream, + mockReadExecutionMetaState, mockWriteEvent, - mockCloseWriter, + mockWriteTerminalEvent, } = vi.hoisted(() => ({ mockMarkExecutionCancelled: vi.fn(), mockAbortManualExecution: vi.fn(), - mockCancelPausedExecution: vi.fn(), - mockSetExecutionMeta: vi.fn(), + mockBeginPausedCancellation: vi.fn(), + mockBlockQueuedResumesForCancellation: vi.fn(), + mockClearPausedCancellationIntent: vi.fn(), + mockCompletePausedCancellation: vi.fn(), + mockGetPausedCancellationStatus: vi.fn(), + mockFinalizeExecutionStream: vi.fn(), + mockReadExecutionMetaState: vi.fn(), mockWriteEvent: vi.fn(), - mockCloseWriter: vi.fn(), + mockWriteTerminalEvent: vi.fn(), })) vi.mock('@/lib/execution/cancellation', () => ({ @@ -38,7 +48,13 @@ vi.mock('@/lib/execution/manual-cancellation', () => ({ vi.mock('@/lib/workflows/executor/human-in-the-loop-manager', () => ({ PauseResumeManager: { - cancelPausedExecution: (...args: unknown[]) => 
mockCancelPausedExecution(...args), + beginPausedCancellation: (...args: unknown[]) => mockBeginPausedCancellation(...args), + blockQueuedResumesForCancellation: (...args: unknown[]) => + mockBlockQueuedResumesForCancellation(...args), + clearPausedCancellationIntent: (...args: unknown[]) => + mockClearPausedCancellationIntent(...args), + completePausedCancellation: (...args: unknown[]) => mockCompletePausedCancellation(...args), + getPausedCancellationStatus: (...args: unknown[]) => mockGetPausedCancellationStatus(...args), }, })) @@ -47,10 +63,12 @@ vi.mock('@/lib/workflows/utils', () => workflowsUtilsMock) vi.mock('@/lib/posthog/server', () => posthogServerMock) vi.mock('@/lib/execution/event-buffer', () => ({ - setExecutionMeta: (...args: unknown[]) => mockSetExecutionMeta(...args), + finalizeExecutionStream: (...args: unknown[]) => mockFinalizeExecutionStream(...args), + readExecutionMetaState: (...args: unknown[]) => mockReadExecutionMetaState(...args), createExecutionEventWriter: () => ({ write: (...args: unknown[]) => mockWriteEvent(...args), - close: () => mockCloseWriter(), + writeTerminal: (...args: unknown[]) => mockWriteTerminalEvent(...args), + close: vi.fn().mockResolvedValue(undefined), }), })) @@ -71,10 +89,15 @@ describe('POST /api/workflows/[id]/executions/[executionId]/cancel', () => { allowed: true, }) mockAbortManualExecution.mockReturnValue(false) - mockCancelPausedExecution.mockResolvedValue(false) - mockSetExecutionMeta.mockResolvedValue(undefined) + mockBeginPausedCancellation.mockResolvedValue(false) + mockBlockQueuedResumesForCancellation.mockResolvedValue(false) + mockClearPausedCancellationIntent.mockResolvedValue(undefined) + mockCompletePausedCancellation.mockResolvedValue(false) + mockGetPausedCancellationStatus.mockResolvedValue(null) + mockFinalizeExecutionStream.mockResolvedValue(true) + mockReadExecutionMetaState.mockResolvedValue({ status: 'missing' }) mockWriteEvent.mockResolvedValue({ eventId: 1 }) - 
mockCloseWriter.mockResolvedValue(undefined) + mockWriteTerminalEvent.mockResolvedValue({ eventId: 1 }) }) it('returns success when cancellation was durably recorded', async () => { @@ -159,11 +182,8 @@ describe('POST /api/workflows/[id]/executions/[executionId]/cancel', () => { }) it('returns success when a paused HITL execution is cancelled directly in the database', async () => { - mockMarkExecutionCancelled.mockResolvedValue({ - durablyRecorded: false, - reason: 'redis_unavailable', - }) - mockCancelPausedExecution.mockResolvedValue(true) + mockBeginPausedCancellation.mockResolvedValue(true) + mockCompletePausedCancellation.mockResolvedValue(true) const response = await POST(makeRequest(), makeParams()) @@ -171,12 +191,77 @@ describe('POST /api/workflows/[id]/executions/[executionId]/cancel', () => { await expect(response.json()).resolves.toEqual({ success: true, executionId: 'ex-1', + redisAvailable: true, + durablyRecorded: true, + locallyAborted: false, + pausedCancelled: true, + reason: 'recorded', + }) + expect(mockMarkExecutionCancelled).not.toHaveBeenCalled() + expect(mockWriteTerminalEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'execution:cancelled', + executionId: 'ex-1', + workflowId: 'wf-1', + }), + 'cancelled' + ) + expect(mockFinalizeExecutionStream).not.toHaveBeenCalled() + }) + + it('publishes paused cancellation event even when Redis cancellation is recorded', async () => { + mockBeginPausedCancellation.mockResolvedValue(true) + mockCompletePausedCancellation.mockResolvedValue(true) + + const response = await POST(makeRequest(), makeParams()) + + expect(response.status).toBe(200) + await expect(response.json()).resolves.toMatchObject({ + success: true, + executionId: 'ex-1', + durablyRecorded: true, + pausedCancelled: true, + }) + expect(mockMarkExecutionCancelled).not.toHaveBeenCalled() + expect(mockWriteTerminalEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'execution:cancelled', + executionId: 'ex-1', + 
workflowId: 'wf-1', + }), + 'cancelled' + ) + expect(mockFinalizeExecutionStream).not.toHaveBeenCalled() + }) + + it('does not confirm paused cancellation when terminal event publication fails', async () => { + mockBeginPausedCancellation.mockResolvedValue(true) + mockCompletePausedCancellation.mockResolvedValue(true) + mockWriteTerminalEvent.mockRejectedValue(new Error('Redis unavailable')) + + const response = await POST(makeRequest(), makeParams()) + + expect(response.status).toBe(200) + await expect(response.json()).resolves.toEqual({ + success: false, + executionId: 'ex-1', redisAvailable: false, durablyRecorded: false, locallyAborted: false, - pausedCancelled: true, - reason: 'redis_unavailable', + pausedCancelled: false, + reason: 'paused_event_publish_failed', }) + expect(mockMarkExecutionCancelled).not.toHaveBeenCalled() + expect(mockCompletePausedCancellation).not.toHaveBeenCalled() + expect(mockWriteTerminalEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'execution:cancelled', + executionId: 'ex-1', + workflowId: 'wf-1', + }), + 'cancelled' + ) + expect(mockFinalizeExecutionStream).not.toHaveBeenCalled() }) it('returns 401 when auth fails', async () => { @@ -241,11 +326,7 @@ describe('POST /api/workflows/[id]/executions/[executionId]/cancel', () => { }) it('does not update execution log status in DB when only paused execution was cancelled', async () => { - mockMarkExecutionCancelled.mockResolvedValue({ - durablyRecorded: false, - reason: 'redis_unavailable', - }) - mockCancelPausedExecution.mockResolvedValue(true) + mockBeginPausedCancellation.mockResolvedValue(true) await POST(makeRequest(), makeParams()) diff --git a/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.ts b/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.ts index 595aef5e4a5..841b92c36fd 100644 --- a/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.ts +++ 
b/apps/sim/app/api/workflows/[id]/executions/[executionId]/cancel/route.ts @@ -1,6 +1,7 @@ import { db } from '@sim/db' import { workflowExecutionLogs } from '@sim/db/schema' import { createLogger } from '@sim/logger' +import { sleep } from '@sim/utils/helpers' import { authorizeWorkflowByWorkspacePermission } from '@sim/workflow-authz' import { and, eq } from 'drizzle-orm' import { type NextRequest, NextResponse } from 'next/server' @@ -8,13 +9,83 @@ import { cancelWorkflowExecutionContract } from '@/lib/api/contracts/workflows' import { parseRequest } from '@/lib/api/server' import { checkHybridAuth } from '@/lib/auth/hybrid' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' -import { markExecutionCancelled } from '@/lib/execution/cancellation' -import { createExecutionEventWriter, setExecutionMeta } from '@/lib/execution/event-buffer' +import { + type ExecutionCancellationRecordResult, + markExecutionCancelled, +} from '@/lib/execution/cancellation' +import { createExecutionEventWriter, readExecutionMetaState } from '@/lib/execution/event-buffer' import { abortManualExecution } from '@/lib/execution/manual-cancellation' import { captureServerEvent } from '@/lib/posthog/server' import { PauseResumeManager } from '@/lib/workflows/executor/human-in-the-loop-manager' const logger = createLogger('CancelExecutionAPI') +const PAUSED_CANCELLATION_DB_ATTEMPTS = 3 +const PAUSED_CANCELLATION_DB_RETRY_MS = 200 + +async function completePausedCancellationWithRetry(executionId: string): Promise { + for (let attempt = 1; attempt <= PAUSED_CANCELLATION_DB_ATTEMPTS; attempt++) { + try { + const cancelled = await PauseResumeManager.completePausedCancellation(executionId) + if (cancelled) { + logger.info('Paused execution cancelled in database', { executionId, attempt }) + return true + } + logger.warn('Paused execution cancellation could not be completed in database', { + executionId, + attempt, + }) + return false + } catch (error) { + logger.warn('Failed to 
complete paused execution cancellation in database', { + executionId, + attempt, + error, + }) + if (attempt < PAUSED_CANCELLATION_DB_ATTEMPTS) { + await sleep(PAUSED_CANCELLATION_DB_RETRY_MS) + } + } + } + return false +} + +async function ensurePausedCancellationEventPublished( + executionId: string, + workflowId: string +): Promise { + const metaState = await readExecutionMetaState(executionId) + if (metaState.status === 'found' && metaState.meta.status === 'cancelled') { + return true + } + + const writer = createExecutionEventWriter(executionId) + try { + await writer.writeTerminal( + { + type: 'execution:cancelled', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { duration: 0 }, + }, + 'cancelled' + ) + return true + } catch (error) { + logger.warn('Failed to publish paused execution cancellation event', { + executionId, + error, + }) + return false + } finally { + await writer.close().catch((error) => { + logger.warn('Failed to close paused cancellation event writer', { + executionId, + error, + }) + }) + } +} export const runtime = 'nodejs' export const dynamic = 'force-dynamic' @@ -55,40 +126,102 @@ export const POST = withRouteHandler( logger.info('Cancel execution requested', { workflowId, executionId, userId: auth.userId }) - const cancellation = await markExecutionCancelled(executionId) - const locallyAborted = abortManualExecution(executionId) + let pausedCancellationStarted = false let pausedCancelled = false try { - pausedCancelled = await PauseResumeManager.cancelPausedExecution(executionId) + pausedCancellationStarted = await PauseResumeManager.beginPausedCancellation(executionId) } catch (error) { - logger.warn('Failed to cancel paused execution in database', { executionId, error }) + logger.warn('Failed to begin paused execution cancellation in database', { + executionId, + error, + }) } + const pendingPausedCancellation = pausedCancellationStarted + ? 
null + : await PauseResumeManager.getPausedCancellationStatus(executionId) + const isPausedCancellationPath = + pausedCancellationStarted || pendingPausedCancellation !== null + + const cancellation: ExecutionCancellationRecordResult = isPausedCancellationPath + ? { durablyRecorded: false, reason: 'redis_unavailable' } + : await markExecutionCancelled(executionId) + const locallyAborted = isPausedCancellationPath ? false : abortManualExecution(executionId) - if (cancellation.durablyRecorded) { + if (pausedCancellationStarted) { + logger.info('Paused execution cancellation reserved in database', { executionId }) + } else if (cancellation.durablyRecorded) { logger.info('Execution marked as cancelled in Redis', { executionId }) } else if (locallyAborted) { logger.info('Execution cancelled via local in-process fallback', { executionId }) - } else if (pausedCancelled) { - logger.info('Paused execution cancelled directly in database', { executionId }) - void setExecutionMeta(executionId, { status: 'cancelled', workflowId }).catch(() => {}) - const writer = createExecutionEventWriter(executionId) - void writer - .write({ - type: 'execution:cancelled', - timestamp: new Date().toISOString(), - executionId, - workflowId, - data: { duration: 0 }, - }) - .then(() => writer.close()) - .catch(() => {}) - } else { + } else if (!pausedCancellationStarted) { logger.warn('Execution cancellation was not durably recorded', { executionId, reason: cancellation.reason, }) } + if (!isPausedCancellationPath && (cancellation.durablyRecorded || locallyAborted)) { + await PauseResumeManager.blockQueuedResumesForCancellation(executionId).catch((error) => { + logger.warn('Failed to block queued paused resumes after cancellation', { + executionId, + error, + }) + }) + } else if (!isPausedCancellationPath) { + await PauseResumeManager.clearPausedCancellationIntent(executionId).catch((error) => { + logger.warn( + 'Failed to clear paused cancellation intent after unsuccessful cancellation', + { + 
executionId, + error, + } + ) + }) + } + + let pausedCancellationPublished = false + let pausedCancellationPublishFailed = false + if (pausedCancellationStarted) { + pausedCancellationPublished = await ensurePausedCancellationEventPublished( + executionId, + workflowId + ) + pausedCancellationPublishFailed = !pausedCancellationPublished + if (pausedCancellationPublished) { + pausedCancelled = await completePausedCancellationWithRetry(executionId) + } + } else { + if (pendingPausedCancellation === 'cancelled') { + pausedCancellationPublished = await ensurePausedCancellationEventPublished( + executionId, + workflowId + ) + pausedCancellationPublishFailed = !pausedCancellationPublished + pausedCancelled = pausedCancellationPublished + } else if (pendingPausedCancellation === 'cancelling') { + pausedCancellationPublished = await ensurePausedCancellationEventPublished( + executionId, + workflowId + ) + pausedCancellationPublishFailed = !pausedCancellationPublished + if (pausedCancellationPublished) { + pausedCancelled = await completePausedCancellationWithRetry(executionId) + } + } + } + + if ( + pausedCancellationPublishFailed && + (pausedCancellationStarted || pendingPausedCancellation === 'cancelling') + ) { + await PauseResumeManager.clearPausedCancellationIntent(executionId).catch((error) => { + logger.warn('Failed to clear paused cancellation intent after publish failure', { + executionId, + error, + }) + }) + } + if ((cancellation.durablyRecorded || locallyAborted) && !pausedCancelled) { try { await db @@ -108,7 +241,10 @@ export const POST = withRouteHandler( } } - const success = cancellation.durablyRecorded || locallyAborted || pausedCancelled + const success = + (isPausedCancellationPath + ? 
pausedCancelled && pausedCancellationPublished + : cancellation.durablyRecorded) || locallyAborted if (success) { const workspaceId = workflowAuthorization.workflow?.workspaceId @@ -120,14 +256,30 @@ export const POST = withRouteHandler( ) } + const durablyRecorded = isPausedCancellationPath + ? pausedCancellationPublished + : pausedCancelled || cancellation.durablyRecorded + const reason = pausedCancellationPublishFailed + ? 'paused_event_publish_failed' + : !pausedCancelled && isPausedCancellationPath + ? 'paused_database_cancel_failed' + : pausedCancelled && !pausedCancellationPublished + ? 'paused_event_publish_failed' + : pausedCancelled || isPausedCancellationPath + ? 'recorded' + : cancellation.reason + return NextResponse.json({ success, executionId, - redisAvailable: cancellation.reason !== 'redis_unavailable', - durablyRecorded: cancellation.durablyRecorded, + redisAvailable: + isPausedCancellationPath || pausedCancelled + ? pausedCancellationPublished + : cancellation.reason !== 'redis_unavailable', + durablyRecorded, locallyAborted, pausedCancelled, - reason: cancellation.reason, + reason, }) } catch (error: any) { logger.error('Failed to cancel execution', { workflowId, executionId, error: error.message }) diff --git a/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.test.ts b/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.test.ts new file mode 100644 index 00000000000..5e41a225e9e --- /dev/null +++ b/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.test.ts @@ -0,0 +1,266 @@ +/** + * @vitest-environment node + */ +import { createMockRequest } from '@sim/testing' +import { beforeEach, describe, expect, it, vi } from 'vitest' +import type { ExecutionEventEntry } from '@/lib/execution/event-buffer' + +const { + mockAuthorizeWorkflowByWorkspacePermission, + mockGetSession, + mockReadExecutionEventsState, + mockReadExecutionMetaState, +} = vi.hoisted(() => ({ + 
mockAuthorizeWorkflowByWorkspacePermission: vi.fn(), + mockGetSession: vi.fn(), + mockReadExecutionEventsState: vi.fn(), + mockReadExecutionMetaState: vi.fn(), +})) + +vi.mock('@/lib/auth', () => ({ + getSession: mockGetSession, +})) + +vi.mock('@sim/workflow-authz', () => ({ + authorizeWorkflowByWorkspacePermission: mockAuthorizeWorkflowByWorkspacePermission, +})) + +vi.mock('@/lib/execution/event-buffer', () => ({ + readExecutionEventsState: mockReadExecutionEventsState, + readExecutionMetaState: mockReadExecutionMetaState, +})) + +import { GET } from './route' + +function completedEntry(eventId: number): ExecutionEventEntry { + return { + eventId, + executionId: 'exec-1', + event: { + type: 'execution:completed', + timestamp: new Date().toISOString(), + executionId: 'exec-1', + workflowId: 'wf-1', + data: { + success: true, + output: {}, + duration: 10, + startTime: new Date().toISOString(), + endTime: new Date().toISOString(), + finalBlockLogs: [], + }, + }, + } +} + +describe('execution stream reconnect route', () => { + beforeEach(() => { + vi.clearAllMocks() + mockGetSession.mockResolvedValue({ user: { id: 'user-1' } }) + mockAuthorizeWorkflowByWorkspacePermission.mockResolvedValue({ allowed: true }) + mockReadExecutionMetaState.mockResolvedValue({ + status: 'found', + meta: { status: 'active', workflowId: 'wf-1' }, + }) + mockReadExecutionEventsState.mockResolvedValue({ status: 'ok', events: [] }) + }) + + it('drains final events after terminal meta before sending DONE', async () => { + mockReadExecutionMetaState + .mockResolvedValueOnce({ + status: 'found', + meta: { status: 'active', workflowId: 'wf-1' }, + }) + .mockResolvedValueOnce({ + status: 'found', + meta: { status: 'complete', workflowId: 'wf-1' }, + }) + mockReadExecutionEventsState + .mockResolvedValueOnce({ status: 'ok', events: [] }) + .mockResolvedValueOnce({ status: 'ok', events: [completedEntry(4)] }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 
'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + const body = await response.text() + const completedIndex = body.indexOf('"type":"execution:completed"') + const doneIndex = body.indexOf('data: [DONE]') + + expect(completedIndex).toBeGreaterThanOrEqual(0) + expect(doneIndex).toBeGreaterThan(completedIndex) + expect(mockReadExecutionEventsState).toHaveBeenNthCalledWith(1, 'exec-1', 3) + expect(mockReadExecutionEventsState).toHaveBeenNthCalledWith(2, 'exec-1', 3) + }) + + it('errors when terminal metadata has no terminal event to replay', async () => { + mockReadExecutionMetaState + .mockResolvedValueOnce({ + status: 'found', + meta: { status: 'active', workflowId: 'wf-1' }, + }) + .mockResolvedValueOnce({ + status: 'found', + meta: { status: 'complete', workflowId: 'wf-1' }, + }) + mockReadExecutionEventsState + .mockResolvedValueOnce({ status: 'ok', events: [] }) + .mockResolvedValueOnce({ status: 'ok', events: [] }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + await expect(response.text()).rejects.toThrow( + 'Execution reached terminal metadata without a terminal event' + ) + }) + + it('allows replay event id gaps from reserved but unused writer ids', async () => { + mockReadExecutionEventsState.mockResolvedValueOnce({ + status: 'ok', + events: [completedEntry(101)], + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + 
expect(response.status).toBe(200) + const body = await response.text() + + expect(body).toContain('"eventId":101') + expect(body).toContain('data: [DONE]') + }) + + it('errors when replay events are not strictly increasing', async () => { + mockReadExecutionEventsState.mockResolvedValueOnce({ + status: 'ok', + events: [completedEntry(3)], + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + await expect(response.text()).rejects.toThrow( + 'Execution event replay order violation: previous 3, received 3' + ) + }) + + it('returns unavailable when metadata cannot be read', async () => { + mockReadExecutionMetaState.mockResolvedValueOnce({ + status: 'unavailable', + error: 'redis unavailable', + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(503) + await expect(response.json()).resolves.toEqual({ + error: 'Run buffer temporarily unavailable', + }) + }) + + it('stops after replaying a terminal event even when metadata is still active', async () => { + mockReadExecutionEventsState.mockResolvedValueOnce({ + status: 'ok', + events: [completedEntry(4)], + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + const body = await response.text() + + expect(body).toContain('"type":"execution:completed"') + expect(body).toContain('data: [DONE]') + 
expect(mockReadExecutionEventsState).toHaveBeenCalledTimes(1) + expect(mockReadExecutionMetaState).toHaveBeenCalledTimes(1) + }) + + it('errors the stream when replay events cannot be read', async () => { + mockReadExecutionEventsState.mockResolvedValueOnce({ + status: 'unavailable', + error: 'redis read failed', + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + await expect(response.text()).rejects.toThrow('Execution events unavailable: redis read failed') + }) + + it('errors the stream when requested events were pruned', async () => { + mockReadExecutionEventsState.mockResolvedValueOnce({ + status: 'pruned', + earliestEventId: 10, + }) + + const req = createMockRequest( + 'GET', + undefined, + undefined, + 'http://localhost/api/workflows/wf-1/executions/exec-1/stream?from=3' + ) + const response = await GET(req, { + params: Promise.resolve({ id: 'wf-1', executionId: 'exec-1' }), + }) + + expect(response.status).toBe(200) + await expect(response.text()).rejects.toThrow( + 'Execution events pruned before requested event id' + ) + }) +}) diff --git a/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.ts b/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.ts index 4775306d0c4..6915a8dcbc1 100644 --- a/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.ts +++ b/apps/sim/app/api/workflows/[id]/executions/[executionId]/stream/route.ts @@ -9,10 +9,12 @@ import { getSession } from '@/lib/auth' import { SSE_HEADERS } from '@/lib/core/utils/sse' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' import { + type ExecutionEventEntry, type ExecutionStreamStatus, - getExecutionMeta, - readExecutionEvents, + readExecutionEventsState, + readExecutionMetaState, 
} from '@/lib/execution/event-buffer' +import type { ExecutionEvent } from '@/lib/workflows/executor/execution-events' import { formatSSEEvent } from '@/lib/workflows/executor/execution-events' const logger = createLogger('ExecutionStreamReconnectAPI') @@ -24,6 +26,15 @@ function isTerminalStatus(status: ExecutionStreamStatus): boolean { return status === 'complete' || status === 'error' || status === 'cancelled' } +function isTerminalEvent(event: ExecutionEvent): boolean { + return ( + event.type === 'execution:completed' || + event.type === 'execution:error' || + event.type === 'execution:cancelled' || + event.type === 'execution:paused' + ) +} + export const runtime = 'nodejs' export const dynamic = 'force-dynamic' @@ -52,10 +63,14 @@ export const GET = withRouteHandler( ) } - const meta = await getExecutionMeta(executionId) - if (!meta) { + const metaResult = await readExecutionMetaState(executionId) + if (metaResult.status === 'unavailable') { + return NextResponse.json({ error: 'Run buffer temporarily unavailable' }, { status: 503 }) + } + if (metaResult.status === 'missing') { return NextResponse.json({ error: 'Run buffer not found or expired' }, { status: 404 }) } + const { meta } = metaResult if (meta.workflowId && meta.workflowId !== workflowId) { return NextResponse.json({ error: 'Run does not belong to this workflow' }, { status: 403 }) @@ -86,19 +101,68 @@ export const GET = withRouteHandler( } } - try { - const events = await readExecutionEvents(executionId, lastEventId) + const readEventsOrThrow = async ( + afterEventId: number + ): Promise => { + const result = await readExecutionEventsState(executionId, afterEventId) + if (result.status === 'unavailable') { + throw new Error(`Execution events unavailable: ${result.error}`) + } + if (result.status === 'pruned') { + throw new Error( + `Execution events pruned before requested event id: earliest retained event is ${result.earliestEventId}` + ) + } + let previousEventId = afterEventId + for (const 
entry of result.events) { + if (entry.eventId <= previousEventId) { + throw new Error( + `Execution event replay order violation: previous ${previousEventId}, received ${entry.eventId}` + ) + } + previousEventId = entry.eventId + } + return result.events + } + + const enqueueEvents = (events: ExecutionEventEntry[]) => { + let sawTerminalEvent = false for (const entry of events) { - if (closed) return + if (closed) break entry.event.eventId = entry.eventId enqueue(formatSSEEvent(entry.event)) lastEventId = entry.eventId + sawTerminalEvent ||= isTerminalEvent(entry.event) + } + return sawTerminalEvent + } + + const closeWithDone = () => { + enqueue('data: [DONE]\n\n') + if (!closed) controller.close() + } + + const closeAfterTerminalEvent = (events: ExecutionEventEntry[]) => { + if (!enqueueEvents(events)) { + throw new Error('Execution reached terminal metadata without a terminal event') + } + closeWithDone() + } + + try { + const events = await readEventsOrThrow(lastEventId) + if (enqueueEvents(events)) { + closeWithDone() + return } - const currentMeta = await getExecutionMeta(executionId) - if (!currentMeta || isTerminalStatus(currentMeta.status)) { - enqueue('data: [DONE]\n\n') - if (!closed) controller.close() + const currentMeta = await readExecutionMetaState(executionId) + if (currentMeta.status === 'unavailable') { + throw new Error(`Execution metadata unavailable: ${currentMeta.error}`) + } + if (currentMeta.status === 'missing' || isTerminalStatus(currentMeta.meta.status)) { + const finalEvents = await readEventsOrThrow(lastEventId) + closeAfterTerminalEvent(finalEvents) return } @@ -106,33 +170,26 @@ export const GET = withRouteHandler( await sleep(POLL_INTERVAL_MS) if (closed) return - const newEvents = await readExecutionEvents(executionId, lastEventId) - for (const entry of newEvents) { - if (closed) return - entry.event.eventId = entry.eventId - enqueue(formatSSEEvent(entry.event)) - lastEventId = entry.eventId + const newEvents = await 
readEventsOrThrow(lastEventId) + if (enqueueEvents(newEvents)) { + closeWithDone() + return } - const polledMeta = await getExecutionMeta(executionId) - if (!polledMeta || isTerminalStatus(polledMeta.status)) { - const finalEvents = await readExecutionEvents(executionId, lastEventId) - for (const entry of finalEvents) { - if (closed) return - entry.event.eventId = entry.eventId - enqueue(formatSSEEvent(entry.event)) - lastEventId = entry.eventId - } - enqueue('data: [DONE]\n\n') - if (!closed) controller.close() + const polledMeta = await readExecutionMetaState(executionId) + if (polledMeta.status === 'unavailable') { + throw new Error(`Execution metadata unavailable: ${polledMeta.error}`) + } + if (polledMeta.status === 'missing' || isTerminalStatus(polledMeta.meta.status)) { + const finalEvents = await readEventsOrThrow(lastEventId) + closeAfterTerminalEvent(finalEvents) return } } if (!closed) { logger.warn('Reconnection stream poll deadline reached', { executionId }) - enqueue('data: [DONE]\n\n') - controller.close() + throw new Error('Execution stream ended before a terminal event was available') } } catch (error) { logger.error('Error in reconnection stream', { @@ -141,7 +198,7 @@ export const GET = withRouteHandler( }) if (!closed) { try { - controller.close() + controller.error(error) } catch {} } } diff --git a/apps/sim/app/workspace/[workspaceId]/home/hooks/use-chat.ts b/apps/sim/app/workspace/[workspaceId]/home/hooks/use-chat.ts index f320d44d46a..03215172116 100644 --- a/apps/sim/app/workspace/[workspaceId]/home/hooks/use-chat.ts +++ b/apps/sim/app/workspace/[workspaceId]/home/hooks/use-chat.ts @@ -4596,7 +4596,7 @@ export function useChat( }).catch(() => {}) } - consoleStore.cancelRunningEntries(workflowId) + consoleStore.cancelRunningEntries(workflowId, executionId ?? 
undefined) const now = new Date() consoleStore.addConsole({ input: {}, diff --git a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-workflow-execution.ts b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-workflow-execution.ts index f51ed9d8b13..e71041afd05 100644 --- a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-workflow-execution.ts +++ b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/hooks/use-workflow-execution.ts @@ -10,6 +10,7 @@ import { requestJson } from '@/lib/api/client/request' import { cancelWorkflowExecutionContract, workflowLogContract } from '@/lib/api/contracts/workflows' import { buildTraceSpans } from '@/lib/logs/execution/trace-spans/trace-spans' import { processStreamingBlockLogs } from '@/lib/tokenization' +import type { ExecutionPausedData } from '@/lib/workflows/executor/execution-events' import { extractTriggerMockPayload, selectBestTrigger, @@ -37,7 +38,12 @@ import { hasExecutionResult } from '@/executor/utils/errors' import { coerceValue } from '@/executor/utils/start-block' import { subscriptionKeys } from '@/hooks/queries/subscription' import { getWorkflows } from '@/hooks/queries/utils/workflow-cache' -import { isExecutionStreamHttpError, useExecutionStream } from '@/hooks/use-execution-stream' +import { + isExecutionStreamHttpError, + SSEEventHandlerError, + SSEStreamInterruptedError, + useExecutionStream, +} from '@/hooks/use-execution-stream' import { WorkflowValidationError } from '@/serializer' import { defaultWorkflowExecutionState, useExecutionStore } from '@/stores/execution' import { useNotificationStore } from '@/stores/notifications' @@ -63,10 +69,12 @@ const logger = createLogger('useWorkflowExecution') */ const activeReconnections = new Set() -function isReconnectTerminal(error: unknown): boolean { +function isReconnectNonRetryable(error: unknown): boolean { + const message = error instanceof Error ? 
error.message : '' return ( - isExecutionStreamHttpError(error) && - (error.httpStatus === 404 || error.httpStatus === 403 || error.httpStatus === 401) + message.includes('Execution events pruned before requested event id') || + (isExecutionStreamHttpError(error) && + (error.httpStatus === 404 || error.httpStatus === 403 || error.httpStatus === 401)) ) } @@ -77,10 +85,25 @@ interface DebugValidationResult { const WORKFLOW_EXECUTION_FAILURE_MESSAGE = 'Workflow execution failed' +async function persistExecutionPointerProgress( + workflowId: string, + executionId: string, + lastEventId: number +): Promise { + await consolePersistence.persist() + await saveExecutionPointer({ workflowId, executionId, lastEventId }) +} + function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null } +function isRecoverableStreamRecoveryError( + error: unknown +): error is SSEEventHandlerError | SSEStreamInterruptedError { + return error instanceof SSEEventHandlerError || error instanceof SSEStreamInterruptedError +} + function sanitizeMessage(value: unknown): string | undefined { if (typeof value !== 'string') return undefined const trimmed = value.trim() @@ -120,16 +143,23 @@ export function useWorkflowExecution() { const queryClient = useQueryClient() const currentWorkflow = useCurrentWorkflow() const activeWorkflowId = useWorkflowRegistry((s) => s.activeWorkflowId) - const { toggleConsole, addConsole, updateConsole, cancelRunningEntries, clearExecutionEntries } = - useTerminalConsoleStore( - useShallow((s) => ({ - toggleConsole: s.toggleConsole, - addConsole: s.addConsole, - updateConsole: s.updateConsole, - cancelRunningEntries: s.cancelRunningEntries, - clearExecutionEntries: s.clearExecutionEntries, - })) - ) + const { + toggleConsole, + addConsole, + updateConsole, + cancelRunningEntries, + finishRunningEntries, + clearExecutionEntries, + } = useTerminalConsoleStore( + useShallow((s) => ({ + toggleConsole: s.toggleConsole, + addConsole: 
s.addConsole, + updateConsole: s.updateConsole, + cancelRunningEntries: s.cancelRunningEntries, + finishRunningEntries: s.finishRunningEntries, + clearExecutionEntries: s.clearExecutionEntries, + })) + ) const hasHydrated = useTerminalConsoleStore((s) => s._hasHydrated) const { getVariablesByWorkflowId, variables } = useVariablesStore( useShallow((s) => ({ @@ -157,10 +187,15 @@ export function useWorkflowExecution() { const setIsExecuting = useCallback( (workflowId: string, executing: boolean) => { + const wasExecuting = useExecutionStore.getState().getWorkflowExecution(workflowId).isExecuting if (executing) { - consolePersistence.executionStarted() + if (!wasExecuting) { + consolePersistence.executionStarted() + } } else { - consolePersistence.executionEnded() + if (wasExecuting) { + consolePersistence.executionEnded() + } clearExecutionPointer(workflowId) } rawSetIsExecuting(workflowId, executing) @@ -178,8 +213,10 @@ export function useWorkflowExecution() { const getLastExecutionSnapshot = useExecutionStore((s) => s.getLastExecutionSnapshot) const clearLastExecutionSnapshot = useExecutionStore((s) => s.clearLastExecutionSnapshot) const [executionResult, setExecutionResult] = useState(null) + const [reconnectAttemptNonce, setReconnectAttemptNonce] = useState(0) const executionStream = useExecutionStream() const currentChatExecutionIdRef = useRef(null) + const runFromBlockOwnerRef = useRef(null) const lastSeenEventIdRef = useRef(0) const isViewingDiff = useWorkflowDiffStore((state) => state.isShowingDiff) const addNotification = useNotificationStore((state) => state.addNotification) @@ -438,6 +475,7 @@ export function useWorkflowExecution() { if (isChatExecution) { let isCancelled = false const executionId = generateId() + let preserveChatExecutionForRecovery = false currentChatExecutionIdRef.current = executionId const stream = new ReadableStream({ async start(controller) { @@ -712,6 +750,15 @@ export function useWorkflowExecution() { // Note: Logs are already 
persisted server-side via execution-core.ts } } catch (error: any) { + if (isRecoverableStreamRecoveryError(error)) { + preserveChatExecutionForRecovery = true + logger.warn('Chat workflow stream interrupted; waiting for reconnect replay', { + workflowId: activeWorkflowId, + executionId: error.executionId, + error: error.message, + }) + return + } // Create a proper error result for logging const errorResult = { success: false, @@ -733,7 +780,10 @@ export function useWorkflowExecution() { if (!isCancelled) { controller.close() } - if (currentChatExecutionIdRef.current === executionId) { + if ( + !preserveChatExecutionForRecovery && + currentChatExecutionIdRef.current === executionId + ) { setIsExecuting(activeWorkflowId, false) setIsDebugging(activeWorkflowId, false) setActiveBlocks(activeWorkflowId, new Set()) @@ -764,6 +814,10 @@ export function useWorkflowExecution() { } return result } catch (error: any) { + if (isRecoverableStreamRecoveryError(error)) { + handleExecutionError(error, { executionId: manualExecutionId }) + throw error + } const errorResult = handleExecutionError(error, { executionId: manualExecutionId }) return errorResult } @@ -1047,6 +1101,7 @@ export function useWorkflowExecution() { loops: latestWorkflowState.loops, parallels: latestWorkflowState.parallels, } + let executionFinished = false await executionStream.execute({ workflowId: activeWorkflowId, @@ -1073,8 +1128,16 @@ export function useWorkflowExecution() { }) }, callbacks: { - onEventId: (eventId) => { + onEventId: async (eventId) => { + if (executionFinished) return lastSeenEventIdRef.current = eventId + if (eventId % 5 === 0 && activeWorkflowId && executionIdRef.current) { + await persistExecutionPointerProgress( + activeWorkflowId, + executionIdRef.current, + eventId + ) + } }, onExecutionStarted: (data) => { @@ -1121,6 +1184,7 @@ export function useWorkflowExecution() { }, onExecutionCompleted: (data) => { + executionFinished = true if ( activeWorkflowId && executionIdRef.current 
&& @@ -1137,7 +1201,7 @@ export function useWorkflowExecution() { executionIdRef.current, data.finalBlockLogs ) - cancelRunningEntries(activeWorkflowId) + finishRunningEntries(activeWorkflowId, executionIdRef.current) } executionResult = { @@ -1216,7 +1280,52 @@ export function useWorkflowExecution() { } }, + onExecutionPaused: (data: ExecutionPausedData) => { + executionFinished = true + if ( + activeWorkflowId && + executionIdRef.current && + useExecutionStore.getState().getCurrentExecutionId(activeWorkflowId) !== + executionIdRef.current + ) + return + + if (activeWorkflowId) { + setCurrentExecutionId(activeWorkflowId, null) + reconcileFinalBlockLogs( + updateConsole, + activeWorkflowId, + executionIdRef.current, + data.finalBlockLogs + ) + finishRunningEntries(activeWorkflowId, executionIdRef.current) + } + + executionResult = { + success: true, + output: data.output, + metadata: { + duration: data.duration, + startTime: data.startTime, + endTime: data.endTime, + }, + logs: accumulatedBlockLogs, + } + + const workflowExecState = activeWorkflowId + ? 
useExecutionStore.getState().getWorkflowExecution(activeWorkflowId) + : null + if (activeWorkflowId && !workflowExecState?.isDebugging) { + setExecutionResult(executionResult) + if (!isExecutingFromChat) { + setIsExecuting(activeWorkflowId, false) + setActiveBlocks(activeWorkflowId, new Set()) + } + } + }, + onExecutionError: (data) => { + executionFinished = true if ( activeWorkflowId && executionIdRef.current && @@ -1258,6 +1367,7 @@ export function useWorkflowExecution() { }, onExecutionCancelled: (data) => { + executionFinished = true if ( activeWorkflowId && executionIdRef.current && @@ -1288,6 +1398,10 @@ export function useWorkflowExecution() { return executionResult } catch (error: any) { + if (isRecoverableStreamRecoveryError(error)) { + handleExecutionError(error, { executionId: executionIdRef.current }) + throw error + } if (error.name === 'AbortError' || error.message?.includes('aborted')) { logger.info('Execution aborted by user') return executionResult @@ -1345,6 +1459,12 @@ export function useWorkflowExecution() { blockName: error.blockName || 'Workflow', blockType: error.blockType || 'serializer', }) + } else if (isRecoverableStreamRecoveryError(error)) { + logger.warn('Execution stream needs reconnect without authoritative terminal state', { + workflowId: activeWorkflowId, + executionId: error.executionId ?? 
options?.executionId, + error: error.message, + }) } else { sharedAddExecutionErrorConsoleEntry(storeAddConsole, { workflowId: activeWorkflowId || '', @@ -1365,6 +1485,13 @@ export function useWorkflowExecution() { } } + if (isRecoverableStreamRecoveryError(error)) { + if (activeWorkflowId) { + setReconnectAttemptNonce((nonce) => nonce + 1) + } + return errorResult + } + setExecutionResult(errorResult) if (activeWorkflowId) { setIsExecuting(activeWorkflowId, false) @@ -1545,21 +1672,41 @@ export function useWorkflowExecution() { const storedExecutionId = getCurrentExecutionId(activeWorkflowId) if (storedExecutionId) { - setCurrentExecutionId(activeWorkflowId, null) - requestJson(cancelWorkflowExecutionContract, { + void requestJson(cancelWorkflowExecutionContract, { params: { id: activeWorkflowId, executionId: storedExecutionId }, - }).catch(() => {}) - handleExecutionCancelledConsole({ - workflowId: activeWorkflowId, - executionId: storedExecutionId, }) - } + .then((result) => { + if (!result.success) { + logger.warn('Workflow execution cancellation was not confirmed', { + workflowId: activeWorkflowId, + executionId: storedExecutionId, + reason: result.reason, + }) + return + } - executionStream.cancel(activeWorkflowId) - currentChatExecutionIdRef.current = null - setIsExecuting(activeWorkflowId, false) - setIsDebugging(activeWorkflowId, false) - setActiveBlocks(activeWorkflowId, new Set()) + const currentId = getCurrentExecutionId(activeWorkflowId) + if (currentId !== storedExecutionId) return + + logger.info('Workflow execution cancellation confirmed; awaiting terminal event', { + workflowId: activeWorkflowId, + executionId: storedExecutionId, + }) + }) + .catch((error) => { + logger.warn('Failed to request workflow execution cancellation', { + workflowId: activeWorkflowId, + executionId: storedExecutionId, + error, + }) + }) + } else { + executionStream.cancel(activeWorkflowId) + currentChatExecutionIdRef.current = null + setIsExecuting(activeWorkflowId, false) 
+ setIsDebugging(activeWorkflowId, false) + setActiveBlocks(activeWorkflowId, new Set()) + } if (isDebugging) { resetDebugState() @@ -1573,8 +1720,6 @@ export function useWorkflowExecution() { setActiveBlocks, activeWorkflowId, getCurrentExecutionId, - setCurrentExecutionId, - handleExecutionCancelledConsole, ]) /** @@ -1674,13 +1819,29 @@ export function useWorkflowExecution() { } setIsExecuting(workflowId, true) + const runOwnerId = generateId() + runFromBlockOwnerRef.current = runOwnerId const executionIdRef = { current: '' } const accumulatedBlockLogs: BlockLog[] = [] const accumulatedBlockStates = new Map() const executedBlockIds = new Set() const activeBlocksSet = new Set() const activeBlockRefCounts = new Map() + const isCurrentRunFromBlockExecution = () => { + return ( + Boolean(executionIdRef.current) && + getCurrentExecutionId(workflowId) === executionIdRef.current + ) + } + const clearRunFromBlockExecutionState = () => { + if (!isCurrentRunFromBlockExecution()) return false + setCurrentExecutionId(workflowId, null) + setIsExecuting(workflowId, false) + setActiveBlocks(workflowId, new Set()) + return true + } + let preserveExecutionForRecovery = false try { const blockHandlers = buildBlockEventHandlers({ workflowId, @@ -1700,23 +1861,33 @@ export function useWorkflowExecution() { sourceSnapshot: effectiveSnapshot, input: workflowInput, onExecutionId: (id) => { + if (runFromBlockOwnerRef.current !== runOwnerId) return executionIdRef.current = id setCurrentExecutionId(workflowId, id) + saveExecutionPointer({ + workflowId, + executionId: id, + lastEventId: 0, + }) }, callbacks: { + onEventId: async (eventId) => { + if (executionIdRef.current && !isCurrentRunFromBlockExecution()) return + if (eventId % 5 === 0 && executionIdRef.current) { + await persistExecutionPointerProgress(workflowId, executionIdRef.current, eventId) + } + }, + onBlockStarted: blockHandlers.onBlockStarted, onBlockCompleted: blockHandlers.onBlockCompleted, onBlockError: 
blockHandlers.onBlockError, onBlockChildWorkflowStarted: blockHandlers.onBlockChildWorkflowStarted, onExecutionCompleted: (data) => { - reconcileFinalBlockLogs( - updateConsole, - workflowId, - executionIdRef.current, - data.finalBlockLogs - ) - cancelRunningEntries(workflowId) + if (!isCurrentRunFromBlockExecution()) return + const executionId = executionIdRef.current + reconcileFinalBlockLogs(updateConsole, workflowId, executionId, data.finalBlockLogs) + finishRunningEntries(workflowId, executionId) if (data.success) { executedBlockIds.add(blockId) @@ -1743,12 +1914,31 @@ export function useWorkflowExecution() { setLastExecutionSnapshot(workflowId, updatedSnapshot) } - setCurrentExecutionId(workflowId, null) - setIsExecuting(workflowId, false) - setActiveBlocks(workflowId, new Set()) + clearRunFromBlockExecutionState() + }, + + onExecutionPaused: (data) => { + if (!isCurrentRunFromBlockExecution()) return + const executionId = executionIdRef.current + reconcileFinalBlockLogs(updateConsole, workflowId, executionId, data.finalBlockLogs) + finishRunningEntries(workflowId, executionId) + + clearRunFromBlockExecutionState() + setExecutionResult({ + success: true, + output: data.output, + metadata: { + duration: data.duration, + startTime: data.startTime, + endTime: data.endTime, + }, + logs: accumulatedBlockLogs, + }) }, onExecutionError: (data) => { + if (!isCurrentRunFromBlockExecution()) return + const executionId = executionIdRef.current const isWorkflowModified = data.error?.includes('Block not found in workflow') || data.error?.includes('Upstream dependency not executed') @@ -1765,42 +1955,70 @@ export function useWorkflowExecution() { handleExecutionErrorConsole({ workflowId, - executionId: executionIdRef.current, + executionId, error: data.error, durationMs: data.duration, blockLogs: accumulatedBlockLogs, finalBlockLogs: data.finalBlockLogs, }) - setCurrentExecutionId(workflowId, null) - setIsExecuting(workflowId, false) - setActiveBlocks(workflowId, new 
Set()) + clearRunFromBlockExecutionState() }, onExecutionCancelled: (data) => { + if (!isCurrentRunFromBlockExecution()) return + const executionId = executionIdRef.current handleExecutionCancelledConsole({ workflowId, - executionId: executionIdRef.current, + executionId, durationMs: data?.duration, finalBlockLogs: data?.finalBlockLogs, }) - setCurrentExecutionId(workflowId, null) - setIsExecuting(workflowId, false) - setActiveBlocks(workflowId, new Set()) + clearRunFromBlockExecutionState() }, }, }) } catch (error) { - if ((error as Error).name !== 'AbortError') { + if (isRecoverableStreamRecoveryError(error)) { + preserveExecutionForRecovery = true + logger.warn('Run-from-block stream interrupted; preserving execution for replay', { + workflowId, + executionId: error.executionId ?? executionIdRef.current, + eventType: error instanceof SSEEventHandlerError ? error.eventType : undefined, + eventId: error instanceof SSEEventHandlerError ? error.eventId : undefined, + error: error.message, + }) + setReconnectAttemptNonce((nonce) => nonce + 1) + } else if ((error as Error).name !== 'AbortError') { logger.error('Run-from-block failed:', error) } } finally { - const currentId = getCurrentExecutionId(workflowId) - if (currentId === null || currentId === executionIdRef.current) { - setCurrentExecutionId(workflowId, null) - setIsExecuting(workflowId, false) - setActiveBlocks(workflowId, new Set()) + if (preserveExecutionForRecovery) { + if (runFromBlockOwnerRef.current === runOwnerId) { + runFromBlockOwnerRef.current = null + } + } else { + const currentId = getCurrentExecutionId(workflowId) + if (executionIdRef.current && currentId === executionIdRef.current) { + setCurrentExecutionId(workflowId, null) + setIsExecuting(workflowId, false) + setActiveBlocks(workflowId, new Set()) + if (runFromBlockOwnerRef.current === runOwnerId) { + runFromBlockOwnerRef.current = null + } + } else if ( + !executionIdRef.current && + currentId === null && + runFromBlockOwnerRef.current === 
runOwnerId + ) { + const workflowExecState = useExecutionStore.getState().getWorkflowExecution(workflowId) + if (workflowExecState.isExecuting) { + setIsExecuting(workflowId, false) + setActiveBlocks(workflowId, new Set()) + } + runFromBlockOwnerRef.current = null + } } } }, @@ -1814,6 +2032,9 @@ export function useWorkflowExecution() { setActiveBlocks, setBlockRunStatus, setEdgeRunStatus, + updateConsole, + finishRunningEntries, + setExecutionResult, addNotification, buildBlockEventHandlers, handleExecutionErrorConsole, @@ -1843,21 +2064,9 @@ export function useWorkflowExecution() { } catch (error) { const errorResult = handleExecutionError(error, { executionId }) return errorResult - } finally { - setCurrentExecutionId(workflowId, null) - setIsExecuting(workflowId, false) - setIsDebugging(workflowId, false) - setActiveBlocks(workflowId, new Set()) } }, - [ - activeWorkflowId, - setCurrentExecutionId, - setExecutionResult, - setIsExecuting, - setIsDebugging, - setActiveBlocks, - ] + [activeWorkflowId, setExecutionResult, setIsExecuting] ) useEffect(() => { @@ -1866,8 +2075,16 @@ export function useWorkflowExecution() { let cleanupRan = false let reconnectionComplete = false + let ownsReconnect = false + let ownedReconnectExecutionId: string | null = null const reconnectWorkflowId = activeWorkflowId + const releaseReconnectOwnership = () => { + activeReconnections.delete(reconnectWorkflowId) + ownsReconnect = false + ownedReconnectExecutionId = null + } + const runReconnect = async () => { let executionId: string | undefined let fromEventId = 0 @@ -1883,34 +2100,30 @@ export function useWorkflowExecution() { // fall through to console entries } - if (!executionId) { - const entries = useTerminalConsoleStore.getState().getWorkflowEntries(reconnectWorkflowId) - const runningEntries = entries.filter( - (e) => e.isRunning && e.workflowId === reconnectWorkflowId && e.executionId - ) - if (runningEntries.length === 0) return - - const sorted = 
[...runningEntries].sort((a, b) => { - const aTime = a.startedAt ? new Date(a.startedAt).getTime() : 0 - const bTime = b.startedAt ? new Date(b.startedAt).getTime() : 0 - return bTime - aTime - }) - executionId = sorted[0].executionId! - - const otherExecutionIds = new Set( - sorted.filter((e) => e.executionId !== executionId).map((e) => e.executionId!) - ) - if (otherExecutionIds.size > 0) { - cancelRunningEntries(reconnectWorkflowId) - consolePersistence.persist() + if (!executionId || cleanupRan) return + const capturedExecutionId = executionId + const canReconnectClaimWorkflow = () => { + const executionState = useExecutionStore + .getState() + .getWorkflowExecution(reconnectWorkflowId) + const currentId = executionState?.currentExecutionId ?? null + if (currentId) return currentId === capturedExecutionId + return !executionState?.isExecuting + } + const clearCapturedExecutionPointer = async () => { + const pointer = await loadExecutionPointer(reconnectWorkflowId).catch(() => null) + if (pointer?.executionId === capturedExecutionId) { + await clearExecutionPointer(reconnectWorkflowId) } } - - if (!executionId || cleanupRan) return + if (!canReconnectClaimWorkflow()) { + await clearCapturedExecutionPointer() + return + } if (activeReconnections.has(reconnectWorkflowId)) return activeReconnections.add(reconnectWorkflowId) - - executionStream.cancel(reconnectWorkflowId) + ownsReconnect = true + executionStream.cancelExecute(reconnectWorkflowId) const workflowEdges = useWorkflowStore.getState().edges const activeBlocksSet = new Set() @@ -1932,47 +2145,80 @@ export function useWorkflowExecution() { includeStartConsoleEntry: true, }) - const capturedExecutionId = executionId + ownedReconnectExecutionId = capturedExecutionId const MAX_ATTEMPTS = 5 const BASE_DELAY_MS = 1000 const MAX_DELAY_MS = 15000 let activated = false + let activationStartedPersistence = false + const isReconnectStillCurrent = canReconnectClaimWorkflow + const stopStaleReconnect = () => { + 
reconnectionComplete = true + if (ownedReconnectExecutionId) { + executionStream.cancelReconnect(reconnectWorkflowId, ownedReconnectExecutionId) + } + releaseReconnectOwnership() + } + const releaseActivatedReconnectState = () => { + if (!activated) return + const currentId = useExecutionStore.getState().getCurrentExecutionId(reconnectWorkflowId) + if (currentId !== capturedExecutionId) return + setCurrentExecutionId(reconnectWorkflowId, null) + if (activationStartedPersistence) { + consolePersistence.executionEnded() + activationStartedPersistence = false + } + rawSetIsExecuting(reconnectWorkflowId, false) + setActiveBlocks(reconnectWorkflowId, new Set()) + } + const releaseReconnectStateWithoutTerminal = () => { + const executionState = useExecutionStore + .getState() + .getWorkflowExecution(reconnectWorkflowId) + const currentId = executionState?.currentExecutionId ?? null + if (currentId && currentId !== capturedExecutionId) return + finishRunningEntries(reconnectWorkflowId, capturedExecutionId) + setCurrentExecutionId(reconnectWorkflowId, null) + setIsExecuting(reconnectWorkflowId, false) + setActiveBlocks(reconnectWorkflowId, new Set()) + activationStartedPersistence = false + } + const scheduleRetryableReconnect = () => { + releaseReconnectOwnership() + setTimeout(() => { + if (!cleanupRan && !reconnectionComplete) { + setReconnectAttemptNonce((nonce) => nonce + 1) + } + }, MAX_DELAY_MS) + } const ensureActivated = () => { - if (activated || cleanupRan) return - activated = true - setCurrentExecutionId(reconnectWorkflowId, capturedExecutionId) - setIsExecuting(reconnectWorkflowId, true) - clearExecutionEntries(capturedExecutionId) + if (cleanupRan || reconnectionComplete) return false + if (!isReconnectStillCurrent()) { + stopStaleReconnect() + return false + } + if (!activated) { + activated = true + activationStartedPersistence = !useExecutionStore + .getState() + .getWorkflowExecution(reconnectWorkflowId).isExecuting + 
setCurrentExecutionId(reconnectWorkflowId, capturedExecutionId) + setIsExecuting(reconnectWorkflowId, true) + if (fromEventId === 0) { + clearExecutionEntries(capturedExecutionId) + } + } + return true } const wrapHandler = (handler: (data: T) => void) => (data: T) => { - ensureActivated() + if (!ensureActivated()) return handler(data) } - const cleanupFailedReconnect = () => { - const currentId = useExecutionStore.getState().getCurrentExecutionId(reconnectWorkflowId) - if (currentId && currentId !== capturedExecutionId) return - - const hasRunningEntry = useTerminalConsoleStore - .getState() - .getWorkflowEntries(reconnectWorkflowId) - .some((entry) => entry.isRunning && entry.executionId === capturedExecutionId) - - if (activated || hasRunningEntry) { - cancelRunningEntries(reconnectWorkflowId) - } - - if (currentId === capturedExecutionId) { - setCurrentExecutionId(reconnectWorkflowId, null) - setIsExecuting(reconnectWorkflowId, false) - setActiveBlocks(reconnectWorkflowId, new Set()) - } - } - const attemptReconnect = async (attempt: number): Promise => { if (cleanupRan || reconnectionComplete) return @@ -1988,50 +2234,81 @@ export function useWorkflowExecution() { executionId: capturedExecutionId, fromEventId, callbacks: { - onEventId: (eid) => { - ensureActivated() + onEventId: async (eid) => { + if (reconnectionComplete) return + if (!isReconnectStillCurrent()) { + stopStaleReconnect() + return + } fromEventId = eid + if (eid % 5 === 0) { + await persistExecutionPointerProgress( + reconnectWorkflowId, + capturedExecutionId, + eid + ) + } }, onBlockStarted: wrapHandler(handlers.onBlockStarted), onBlockCompleted: wrapHandler(handlers.onBlockCompleted), onBlockError: wrapHandler(handlers.onBlockError), onBlockChildWorkflowStarted: wrapHandler(handlers.onBlockChildWorkflowStarted), onExecutionCompleted: (data) => { + if (!ensureActivated()) return reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - if (!activated) { - 
clearExecutionPointer(reconnectWorkflowId) - return - } + releaseReconnectOwnership() const currentId = useExecutionStore .getState() .getCurrentExecutionId(reconnectWorkflowId) if (currentId !== capturedExecutionId) return - setCurrentExecutionId(reconnectWorkflowId, null) - setIsExecuting(reconnectWorkflowId, false) - setActiveBlocks(reconnectWorkflowId, new Set()) reconcileFinalBlockLogs( updateConsole, reconnectWorkflowId, capturedExecutionId, data?.finalBlockLogs ) - cancelRunningEntries(reconnectWorkflowId) + finishRunningEntries(reconnectWorkflowId, capturedExecutionId) + setCurrentExecutionId(reconnectWorkflowId, null) + setIsExecuting(reconnectWorkflowId, false) + setActiveBlocks(reconnectWorkflowId, new Set()) }, - onExecutionError: (data) => { + onExecutionPaused: (data) => { + if (!ensureActivated()) return reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - if (!activated) { - clearExecutionPointer(reconnectWorkflowId) - return - } + releaseReconnectOwnership() const currentId = useExecutionStore .getState() .getCurrentExecutionId(reconnectWorkflowId) if (currentId !== capturedExecutionId) return + reconcileFinalBlockLogs( + updateConsole, + reconnectWorkflowId, + capturedExecutionId, + data.finalBlockLogs + ) + finishRunningEntries(reconnectWorkflowId, capturedExecutionId) setCurrentExecutionId(reconnectWorkflowId, null) setIsExecuting(reconnectWorkflowId, false) setActiveBlocks(reconnectWorkflowId, new Set()) + setExecutionResult({ + success: true, + output: data.output, + metadata: { + duration: data.duration, + startTime: data.startTime, + endTime: data.endTime, + }, + logs: accumulatedBlockLogs, + }) + }, + onExecutionError: (data) => { + if (!ensureActivated()) return + reconnectionComplete = true + releaseReconnectOwnership() + const currentId = useExecutionStore + .getState() + .getCurrentExecutionId(reconnectWorkflowId) + if (currentId !== capturedExecutionId) return handleExecutionErrorConsole({ workflowId: 
reconnectWorkflowId, executionId: capturedExecutionId, @@ -2039,39 +2316,40 @@ export function useWorkflowExecution() { blockLogs: accumulatedBlockLogs, finalBlockLogs: data.finalBlockLogs, }) + setCurrentExecutionId(reconnectWorkflowId, null) + setIsExecuting(reconnectWorkflowId, false) + setActiveBlocks(reconnectWorkflowId, new Set()) }, onExecutionCancelled: (data) => { + if (!ensureActivated()) return reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - if (!activated) { - clearExecutionPointer(reconnectWorkflowId) - return - } + releaseReconnectOwnership() const currentId = useExecutionStore .getState() .getCurrentExecutionId(reconnectWorkflowId) if (currentId !== capturedExecutionId) return - setCurrentExecutionId(reconnectWorkflowId, null) - setIsExecuting(reconnectWorkflowId, false) - setActiveBlocks(reconnectWorkflowId, new Set()) handleExecutionCancelledConsole({ workflowId: reconnectWorkflowId, executionId: capturedExecutionId, durationMs: data?.duration, finalBlockLogs: data?.finalBlockLogs, }) + setCurrentExecutionId(reconnectWorkflowId, null) + setIsExecuting(reconnectWorkflowId, false) + setActiveBlocks(reconnectWorkflowId, new Set()) }, }, }) } catch (error) { - if (isReconnectTerminal(error)) { + if (isReconnectNonRetryable(error)) { logger.info('Reconnection skipped; run buffer no longer exists', { executionId: capturedExecutionId, }) reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - clearExecutionPointer(reconnectWorkflowId) - cleanupFailedReconnect() + releaseReconnectStateWithoutTerminal() + await consolePersistence.persist() + releaseReconnectOwnership() + await clearCapturedExecutionPointer() return } @@ -2084,27 +2362,17 @@ export function useWorkflowExecution() { return attemptReconnect(attempt + 1) } if (!cleanupRan && !reconnectionComplete) { - reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - cleanupFailedReconnect() + scheduleRetryableReconnect() + 
await consolePersistence.persist() return } } if (!reconnectionComplete && !cleanupRan) { reconnectionComplete = true - activeReconnections.delete(reconnectWorkflowId) - if (activated) { - const currentId = useExecutionStore - .getState() - .getCurrentExecutionId(reconnectWorkflowId) - if (currentId === capturedExecutionId) { - cancelRunningEntries(reconnectWorkflowId) - setCurrentExecutionId(reconnectWorkflowId, null) - setIsExecuting(reconnectWorkflowId, false) - setActiveBlocks(reconnectWorkflowId, new Set()) - } - } + releaseActivatedReconnectState() + await consolePersistence.persist() + releaseReconnectOwnership() } } @@ -2115,11 +2383,15 @@ export function useWorkflowExecution() { return () => { cleanupRan = true - executionStream.cancel(reconnectWorkflowId) - activeReconnections.delete(reconnectWorkflowId) + if (ownsReconnect) { + if (ownedReconnectExecutionId) { + executionStream.cancelReconnect(reconnectWorkflowId, ownedReconnectExecutionId) + } + releaseReconnectOwnership() + } } // eslint-disable-next-line react-hooks/exhaustive-deps - }, [activeWorkflowId, hasHydrated]) + }, [activeWorkflowId, hasHydrated, reconnectAttemptNonce]) return { isExecuting, diff --git a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.test.ts b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.test.ts index 13840aad4cd..d2c999beef0 100644 --- a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.test.ts +++ b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.test.ts @@ -5,14 +5,196 @@ import { resetTerminalConsoleMock, terminalConsoleMockFns } from '@sim/testing' import { beforeEach, describe, expect, it, vi } from 'vitest' import { addExecutionErrorConsoleEntry, + createBlockEventHandlers, handleExecutionErrorConsole, reconcileFinalBlockLogs, } from '@/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils' import type { 
BlockLog } from '@/executor/types' +import { useExecutionStore } from '@/stores/execution' describe('workflow-execution-utils', () => { beforeEach(() => { resetTerminalConsoleMock() + vi.mocked(useExecutionStore.getState).mockReturnValue({ + getCurrentExecutionId: vi.fn(() => 'exec-1'), + } as any) + }) + + describe('createBlockEventHandlers', () => { + it('skips duplicate block start rows during reconnect replay', () => { + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'fn-1', + blockName: 'Function 1', + blockType: 'function', + executionId: 'exec-1', + executionOrder: 7, + isRunning: false, + success: true, + iterationCurrent: 0, + iterationTotal: 2, + iterationType: 'loop', + iterationContainerId: 'loop-1', + childWorkflowBlockId: 'child-inst-1', + childWorkflowName: 'Child Workflow', + parentIterations: [ + { + iterationCurrent: 1, + iterationTotal: 3, + iterationType: 'parallel', + iterationContainerId: 'parallel-1', + }, + ], + }) + + const addConsole = vi.fn() + const handlers = createBlockEventHandlers( + { + workflowId: 'wf-1', + executionIdRef: { current: 'exec-1' }, + workflowEdges: [], + activeBlocksSet: new Set(), + activeBlockRefCounts: new Map(), + accumulatedBlockLogs: [], + accumulatedBlockStates: new Map(), + executedBlockIds: new Set(), + includeStartConsoleEntry: true, + }, + { + addConsole, + updateConsole: vi.fn(), + setActiveBlocks: vi.fn(), + setBlockRunStatus: vi.fn(), + setEdgeRunStatus: vi.fn(), + } + ) + + handlers.onBlockStarted({ + blockId: 'fn-1', + blockName: 'Function 1', + blockType: 'function', + executionOrder: 7, + iterationCurrent: 0, + iterationTotal: 2, + iterationType: 'loop', + iterationContainerId: 'loop-1', + childWorkflowBlockId: 'child-inst-1', + childWorkflowName: 'Child Workflow', + parentIterations: [ + { + iterationCurrent: 1, + iterationTotal: 3, + iterationType: 'parallel', + iterationContainerId: 'parallel-1', + }, + ], + }) + + expect(addConsole).not.toHaveBeenCalled() + }) + + 
it('keeps distinct start rows when replay identity differs', () => { + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'fn-1', + blockName: 'Function 1', + blockType: 'function', + executionId: 'exec-1', + executionOrder: 7, + isRunning: true, + iterationCurrent: 0, + iterationTotal: 2, + iterationType: 'loop', + iterationContainerId: 'loop-1', + }) + + const addConsole = vi.fn() + const handlers = createBlockEventHandlers( + { + workflowId: 'wf-1', + executionIdRef: { current: 'exec-1' }, + workflowEdges: [], + activeBlocksSet: new Set(), + activeBlockRefCounts: new Map(), + accumulatedBlockLogs: [], + accumulatedBlockStates: new Map(), + executedBlockIds: new Set(), + includeStartConsoleEntry: true, + }, + { + addConsole, + updateConsole: vi.fn(), + setActiveBlocks: vi.fn(), + setBlockRunStatus: vi.fn(), + setEdgeRunStatus: vi.fn(), + } + ) + + handlers.onBlockStarted({ + blockId: 'fn-1', + blockName: 'Function 1', + blockType: 'function', + executionOrder: 7, + iterationCurrent: 1, + iterationTotal: 2, + iterationType: 'loop', + iterationContainerId: 'loop-1', + }) + + expect(addConsole).toHaveBeenCalledTimes(1) + }) + + it('replays early child workflow instance updates after the start row is added', () => { + const updateConsole = vi.fn() + const handlers = createBlockEventHandlers( + { + workflowId: 'wf-1', + executionIdRef: { current: 'exec-1' }, + workflowEdges: [], + activeBlocksSet: new Set(), + activeBlockRefCounts: new Map(), + accumulatedBlockLogs: [], + accumulatedBlockStates: new Map(), + executedBlockIds: new Set(), + includeStartConsoleEntry: true, + }, + { + addConsole: terminalConsoleMockFns.mockAddConsole as any, + updateConsole, + setActiveBlocks: vi.fn(), + setBlockRunStatus: vi.fn(), + setEdgeRunStatus: vi.fn(), + } + ) + + handlers.onBlockChildWorkflowStarted({ + blockId: 'nested-workflow', + childWorkflowInstanceId: 'nested-inst-1', + executionOrder: 4, + childWorkflowBlockId: 'parent-inst-1', + childWorkflowName: 
'Parent Workflow', + }) + handlers.onBlockStarted({ + blockId: 'nested-workflow', + blockName: 'Nested Workflow', + blockType: 'workflow', + executionOrder: 4, + childWorkflowBlockId: 'parent-inst-1', + childWorkflowName: 'Parent Workflow', + }) + + expect(updateConsole).toHaveBeenCalledTimes(2) + expect(updateConsole.mock.calls[1]).toEqual([ + 'nested-workflow', + expect.objectContaining({ + childWorkflowInstanceId: 'nested-inst-1', + childWorkflowBlockId: 'parent-inst-1', + childWorkflowName: 'Parent Workflow', + executionOrder: 4, + }), + 'exec-1', + ]) + }) }) describe('addExecutionErrorConsoleEntry', () => { @@ -225,6 +407,296 @@ describe('workflow-execution-utils', () => { expect(updateConsole).not.toHaveBeenCalled() }) + it('reconciles child workflow spans before running entries are swept to canceled', () => { + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'workflow-1', + blockName: 'Workflow 1', + blockType: 'workflow', + executionId: 'exec-1', + executionOrder: 2, + isRunning: false, + success: true, + childWorkflowInstanceId: 'child-inst-1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'starter', + blockName: 'Start', + blockType: 'starter', + executionId: 'exec-1', + executionOrder: 3, + isRunning: true, + childWorkflowBlockId: 'workflow-1', + childWorkflowName: 'Workflow 1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'api-1', + blockName: 'API 1', + blockType: 'api', + executionId: 'exec-1', + executionOrder: 4, + isRunning: true, + childWorkflowBlockId: 'child-inst-1', + childWorkflowName: 'Workflow 1', + }) + + const startedAt = new Date().toISOString() + const endedAt = new Date(Date.now() + 20).toISOString() + const updateConsole = vi.fn() + reconcileFinalBlockLogs(updateConsole, 'wf-1', 'exec-1', [ + makeLog({ + blockId: 'workflow-1', + blockName: 'Workflow 1', + blockType: 'workflow', + executionOrder: 2, + success: true, + childTraceSpans: [ + { + 
id: 'starter-span', + name: 'Start', + type: 'starter', + blockId: 'starter', + executionOrder: 3, + status: 'success', + duration: 5, + startTime: startedAt, + endTime: endedAt, + output: {}, + }, + { + id: 'api-span', + name: 'API 1', + type: 'api', + blockId: 'api-1', + executionOrder: 4, + status: 'error', + errorHandled: true, + duration: 20, + startTime: startedAt, + endTime: endedAt, + output: { error: 'Request failed' }, + }, + ], + }), + ]) + + expect(updateConsole).toHaveBeenCalledTimes(2) + expect(updateConsole.mock.calls[0]).toEqual([ + 'starter', + expect.objectContaining({ + success: true, + isRunning: false, + isCanceled: false, + childWorkflowBlockId: 'workflow-1', + }), + 'exec-1', + ]) + expect(updateConsole.mock.calls[1]).toEqual([ + 'api-1', + expect.objectContaining({ + executionOrder: 4, + success: false, + error: 'Request failed', + isRunning: false, + isCanceled: false, + childWorkflowBlockId: 'workflow-1', + }), + 'exec-1', + ]) + }) + + it('uses span execution and iteration identity when reconciling repeated child blocks', () => { + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'workflow-1', + blockName: 'Workflow 1', + blockType: 'workflow', + executionId: 'exec-1', + executionOrder: 2, + success: true, + childWorkflowInstanceId: 'child-inst-1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'api-1', + blockName: 'API 1', + blockType: 'api', + executionId: 'exec-1', + executionOrder: 3, + isRunning: true, + iterationCurrent: 0, + iterationType: 'loop', + iterationContainerId: 'loop-1', + childWorkflowBlockId: 'workflow-1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'api-1', + blockName: 'API 1', + blockType: 'api', + executionId: 'exec-1', + executionOrder: 4, + isRunning: true, + iterationCurrent: 1, + iterationType: 'loop', + iterationContainerId: 'loop-1', + childWorkflowBlockId: 'workflow-1', + }) + + const startedAt = new 
Date().toISOString() + const endedAt = new Date(Date.now() + 20).toISOString() + const updateConsole = vi.fn() + reconcileFinalBlockLogs(updateConsole, 'wf-1', 'exec-1', [ + makeLog({ + blockId: 'workflow-1', + blockType: 'workflow', + executionOrder: 2, + childTraceSpans: [ + { + id: 'api-iter-0', + name: 'API 1', + type: 'api', + blockId: 'api-1', + executionOrder: 3, + loopId: 'loop-1', + iterationIndex: 0, + status: 'success', + duration: 10, + startTime: startedAt, + endTime: endedAt, + output: { result: 'first' }, + }, + { + id: 'api-iter-1', + name: 'API 1', + type: 'api', + blockId: 'api-1', + executionOrder: 4, + loopId: 'loop-1', + iterationIndex: 1, + status: 'error', + duration: 20, + startTime: startedAt, + endTime: endedAt, + output: { error: new Error('second failed') }, + }, + ], + }), + ]) + + expect(updateConsole).toHaveBeenCalledTimes(2) + expect(updateConsole.mock.calls[0]).toEqual([ + 'api-1', + expect.objectContaining({ + executionOrder: 3, + iterationCurrent: 0, + iterationType: 'loop', + iterationContainerId: 'loop-1', + replaceOutput: { result: 'first' }, + success: true, + }), + 'exec-1', + ]) + expect(updateConsole.mock.calls[1]).toEqual([ + 'api-1', + expect.objectContaining({ + executionOrder: 4, + iterationCurrent: 1, + iterationType: 'loop', + iterationContainerId: 'loop-1', + error: 'second failed', + success: false, + }), + 'exec-1', + ]) + }) + + it('recurses into nested workflow spans using the nested workflow instance id', () => { + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'workflow-1', + blockName: 'Workflow 1', + blockType: 'workflow', + executionId: 'exec-1', + executionOrder: 2, + success: true, + childWorkflowInstanceId: 'child-inst-1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'nested-workflow', + blockName: 'Nested Workflow', + blockType: 'workflow', + executionId: 'exec-1', + executionOrder: 3, + isRunning: false, + childWorkflowBlockId: 'workflow-1', + 
childWorkflowInstanceId: 'nested-inst-1', + }) + terminalConsoleMockFns.mockAddConsole({ + workflowId: 'wf-1', + blockId: 'nested-api', + blockName: 'Nested API', + blockType: 'api', + executionId: 'exec-1', + executionOrder: 1, + isRunning: true, + childWorkflowBlockId: 'nested-workflow', + }) + + const startedAt = new Date().toISOString() + const endedAt = new Date(Date.now() + 20).toISOString() + const updateConsole = vi.fn() + reconcileFinalBlockLogs(updateConsole, 'wf-1', 'exec-1', [ + makeLog({ + blockId: 'workflow-1', + blockType: 'workflow', + executionOrder: 2, + childTraceSpans: [ + { + id: 'nested-workflow-span', + name: 'Nested Workflow', + type: 'workflow', + blockId: 'nested-workflow', + executionOrder: 3, + status: 'success', + duration: 10, + startTime: startedAt, + endTime: endedAt, + output: {}, + children: [ + { + id: 'nested-api-span', + name: 'Nested API', + type: 'api', + blockId: 'nested-api', + executionOrder: 1, + status: 'success', + duration: 10, + startTime: startedAt, + endTime: endedAt, + output: { ok: true }, + }, + ], + }, + ], + }), + ]) + + expect(updateConsole.mock.calls[1]).toEqual([ + 'nested-api', + expect.objectContaining({ + childWorkflowBlockId: 'nested-workflow', + success: true, + isRunning: false, + isCanceled: false, + }), + 'exec-1', + ]) + }) + it('is a no-op when finalBlockLogs is empty or executionId is missing', () => { const updateConsole = vi.fn() reconcileFinalBlockLogs(updateConsole, 'wf-1', 'exec-1', []) @@ -256,6 +728,7 @@ describe('workflow-execution-utils', () => { expect(calls[0]).toBe('cancel') expect(calls).toContain('add') + expect(cancelRunningEntries).toHaveBeenCalledWith('wf-1', 'exec-1') }) it('reconciles finalBlockLogs before sweeping running entries (Fix C)', () => { diff --git a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.ts b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.ts index 4872ab7c156..4237e6cf8cf 100644 --- 
a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.ts +++ b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils.ts @@ -1,19 +1,31 @@ import { createLogger } from '@sim/logger' +import { toError } from '@sim/utils/errors' import { generateId } from '@sim/utils/id' +import type { TraceSpan } from '@/lib/logs/types' import type { + BlockChildWorkflowStartedData, BlockCompletedData, BlockErrorData, BlockStartedData, } from '@/lib/workflows/executor/execution-events' import type { BlockLog, BlockState, ExecutionResult, StreamingExecution } from '@/executor/types' import { stripCloneSuffixes } from '@/executor/utils/subflow-utils' -import { processSSEStream } from '@/hooks/use-execution-stream' +import { + processSSEStream, + SSEEventHandlerError, + SSEStreamInterruptedError, +} from '@/hooks/use-execution-stream' const logger = createLogger('workflow-execution-utils') import { useExecutionStore } from '@/stores/execution' import type { ConsoleEntry, ConsoleUpdate } from '@/stores/terminal' -import { saveExecutionPointer, useTerminalConsoleStore } from '@/stores/terminal' +import { + clearExecutionPointer, + consolePersistence, + saveExecutionPointer, + useTerminalConsoleStore, +} from '@/stores/terminal' import { useWorkflowRegistry } from '@/stores/workflows/registry/store' import { useWorkflowStore } from '@/stores/workflows/workflow/store' @@ -118,6 +130,8 @@ export interface BlockEventHandlerDeps { setEdgeRunStatus: (workflowId: string, edgeId: string, status: 'success' | 'error') => void } +type BlockChildWorkflowStartedUpdate = BlockChildWorkflowStartedData + /** * Creates block event handlers for SSE execution events. * Shared by the workflow execution hook and standalone execution utilities. 
@@ -140,6 +154,7 @@ export function createBlockEventHandlers( } = config const { addConsole, updateConsole, setActiveBlocks, setBlockRunStatus, setEdgeRunStatus } = deps + const pendingChildWorkflowStarts = new Map() const isStaleExecution = () => !!( @@ -178,6 +193,94 @@ export function createBlockEventHandlers( }), }) + const parentIterationsMatch = ( + left: ConsoleEntry['parentIterations'], + right: BlockStartedData['parentIterations'] + ) => { + if (!left?.length && !right?.length) return true + if (!left || !right || left.length !== right.length) return false + return left.every((entry, index) => { + const other = right[index] + return ( + entry.iterationCurrent === other.iterationCurrent && + entry.iterationTotal === other.iterationTotal && + entry.iterationType === other.iterationType && + entry.iterationContainerId === other.iterationContainerId + ) + }) + } + + type StartedIdentity = { + blockId: string + executionOrder?: number + iterationCurrent?: BlockStartedData['iterationCurrent'] + iterationTotal?: BlockStartedData['iterationTotal'] + iterationType?: BlockStartedData['iterationType'] + iterationContainerId?: BlockStartedData['iterationContainerId'] + childWorkflowBlockId?: BlockStartedData['childWorkflowBlockId'] + childWorkflowName?: BlockStartedData['childWorkflowName'] + parentIterations?: BlockStartedData['parentIterations'] + } + + const startedEntryKey = (data: StartedIdentity) => + JSON.stringify({ + blockId: data.blockId, + executionOrder: data.executionOrder, + iterationCurrent: data.iterationCurrent, + iterationTotal: data.iterationTotal, + iterationType: data.iterationType, + iterationContainerId: data.iterationContainerId, + childWorkflowBlockId: data.childWorkflowBlockId, + childWorkflowName: data.childWorkflowName, + parentIterations: data.parentIterations ?? 
[], + }) + + const matchesStartedIdentity = (entry: ConsoleEntry, data: StartedIdentity) => + entry.executionId === executionIdRef.current && + entry.blockId === data.blockId && + (data.executionOrder === undefined || entry.executionOrder === data.executionOrder) && + entry.iterationCurrent === data.iterationCurrent && + entry.iterationTotal === data.iterationTotal && + entry.iterationType === data.iterationType && + entry.iterationContainerId === data.iterationContainerId && + entry.childWorkflowBlockId === data.childWorkflowBlockId && + entry.childWorkflowName === data.childWorkflowName && + parentIterationsMatch(entry.parentIterations, data.parentIterations) + + const hasExistingStartedEntry = (data: StartedIdentity) => { + if (!workflowId) return false + return useTerminalConsoleStore + .getState() + .getWorkflowEntries(workflowId) + .some((entry) => matchesStartedIdentity(entry, data)) + } + + const applyChildWorkflowStart = (data: BlockChildWorkflowStartedUpdate) => { + updateConsole( + data.blockId, + { + childWorkflowInstanceId: data.childWorkflowInstanceId, + ...(data.iterationCurrent !== undefined && { iterationCurrent: data.iterationCurrent }), + ...(data.iterationTotal !== undefined && { iterationTotal: data.iterationTotal }), + ...(data.iterationType !== undefined && { iterationType: data.iterationType }), + ...(data.iterationContainerId !== undefined && { + iterationContainerId: data.iterationContainerId, + }), + ...(data.parentIterations !== undefined && { + parentIterations: data.parentIterations, + }), + ...(data.childWorkflowBlockId !== undefined && { + childWorkflowBlockId: data.childWorkflowBlockId, + }), + ...(data.childWorkflowName !== undefined && { + childWorkflowName: data.childWorkflowName, + }), + ...(data.executionOrder !== undefined && { executionOrder: data.executionOrder }), + }, + executionIdRef.current + ) + } + const createBlockLogEntry = ( data: BlockCompletedData | BlockErrorData, options: { success: boolean; output?: unknown; 
error?: string } @@ -237,6 +340,7 @@ export function createBlockEventHandlers( updateActiveBlocks(data.blockId, true) if (!includeStartConsoleEntry || !workflowId) return + if (hasExistingStartedEntry(data)) return const startedAt = new Date().toISOString() addConsole({ @@ -255,6 +359,13 @@ export function createBlockEventHandlers( isRunning: true, ...extractIterationFields(data), }) + + const pendingKey = startedEntryKey(data) + const pending = pendingChildWorkflowStarts.get(pendingKey) + if (pending) { + applyChildWorkflowStart(pending) + pendingChildWorkflowStarts.delete(pendingKey) + } } const onBlockCompleted = (data: BlockCompletedData) => { @@ -327,33 +438,19 @@ export function createBlockEventHandlers( updateConsoleErrorEntry(data) } - const onBlockChildWorkflowStarted = (data: { - blockId: string - childWorkflowInstanceId: string - iterationCurrent?: number - iterationContainerId?: string - executionOrder?: number - }) => { + const onBlockChildWorkflowStarted = (data: BlockChildWorkflowStartedUpdate) => { if (isStaleExecution()) return - updateConsole( - data.blockId, - { - childWorkflowInstanceId: data.childWorkflowInstanceId, - ...(data.iterationCurrent !== undefined && { iterationCurrent: data.iterationCurrent }), - ...(data.iterationContainerId !== undefined && { - iterationContainerId: data.iterationContainerId, - }), - ...(data.executionOrder !== undefined && { executionOrder: data.executionOrder }), - }, - executionIdRef.current - ) + applyChildWorkflowStart(data) + if (!hasExistingStartedEntry(data)) { + pendingChildWorkflowStarts.set(startedEntryKey(data), data) + } } return { onBlockStarted, onBlockCompleted, onBlockError, onBlockChildWorkflowStarted } } type AddConsoleFn = (entry: Omit) => ConsoleEntry | undefined -type CancelRunningEntriesFn = (workflowId: string) => void +type CancelRunningEntriesFn = (workflowId: string, executionId?: string) => void type UpdateConsoleFn = ( blockId: string, update: string | ConsoleUpdate, @@ -385,26 +482,155 
@@ export function reconcileFinalBlockLogs( if (!finalBlockLogs?.length || !executionId) return for (const log of finalBlockLogs) { const entries = useTerminalConsoleStore.getState().getWorkflowEntries(workflowId) - const running = entries.find( - (e) => e.blockId === log.blockId && e.executionId === executionId && e.isRunning - ) - if (!running) continue - updateConsole( - log.blockId, - { - executionOrder: log.executionOrder, - replaceOutput: (log.output ?? {}) as Record, - ...(log.input ? { input: log.input } : {}), - success: log.success, - ...(log.error ? { error: log.error } : {}), - durationMs: log.durationMs, - startedAt: log.startedAt, - endedAt: log.endedAt, - isRunning: false, - }, - executionId + const matchesFinalLog = (entry: ConsoleEntry) => + entry.blockId === log.blockId && + entry.executionId === executionId && + entry.executionOrder === log.executionOrder + const matchingEntry = entries.find(matchesFinalLog) + const runningEntry = entries.find((entry) => matchesFinalLog(entry) && entry.isRunning) + if (runningEntry) { + updateConsole( + log.blockId, + { + executionOrder: log.executionOrder, + replaceOutput: (log.output ?? {}) as Record, + ...(log.input ? { input: log.input } : {}), + success: log.success, + ...(log.error ? 
{ error: log.error } : {}), + durationMs: log.durationMs, + startedAt: log.startedAt, + endedAt: log.endedAt, + isRunning: false, + isCanceled: false, + }, + executionId + ) + } + + const childWorkflowInstanceId = matchingEntry?.childWorkflowInstanceId + if (childWorkflowInstanceId && log.childTraceSpans?.length) { + reconcileChildTraceSpans( + updateConsole, + workflowId, + log.blockId, + childWorkflowInstanceId, + executionId, + log.childTraceSpans + ) + } + } +} + +function reconcileChildTraceSpans( + updateConsole: UpdateConsoleFn, + workflowId: string, + childWorkflowBlockId: string, + childWorkflowInstanceId: string, + executionId: string, + spans: TraceSpan[] +): void { + for (const span of spans) { + const matchingEntry = span.blockId + ? findConsoleEntryForSpan(workflowId, executionId, childWorkflowBlockId, span) + : undefined + if (span.blockId) { + const errorMessage = normalizeSpanError(span.output?.error) + updateConsole( + span.blockId, + { + ...spanConsoleIdentity(span, childWorkflowBlockId), + replaceOutput: (span.output ?? {}) as Record, + success: span.status !== 'error', + ...(errorMessage !== undefined ? { error: errorMessage } : {}), + durationMs: span.duration, + startedAt: span.startTime, + endedAt: span.endTime, + isRunning: false, + isCanceled: false, + }, + executionId + ) + } + if (span.children?.length) { + reconcileChildTraceSpans( + updateConsole, + workflowId, + matchingEntry?.blockId ?? childWorkflowBlockId, + matchingEntry?.childWorkflowInstanceId ?? childWorkflowInstanceId, + executionId, + span.children + ) + } + } +} + +function spanConsoleIdentity(span: TraceSpan, childWorkflowBlockId: string): ConsoleUpdate { + const iterationContainerId = span.loopId ?? span.parallelId + const iterationType = span.loopId ? 'loop' : span.parallelId ? 
'parallel' : undefined + return { + ...(span.executionOrder !== undefined && { executionOrder: span.executionOrder }), + ...(span.iterationIndex !== undefined && { iterationCurrent: span.iterationIndex }), + ...(iterationType !== undefined && { iterationType }), + ...(iterationContainerId !== undefined && { iterationContainerId }), + ...(span.parentIterations !== undefined && { parentIterations: span.parentIterations }), + childWorkflowBlockId, + } +} + +function findConsoleEntryForSpan( + workflowId: string, + executionId: string, + childWorkflowBlockId: string, + span: TraceSpan +): ConsoleEntry | undefined { + if (!span.blockId) return undefined + const identity = spanConsoleIdentity(span, childWorkflowBlockId) + return useTerminalConsoleStore + .getState() + .getWorkflowEntries(workflowId) + .find( + (entry) => + entry.blockId === span.blockId && + entry.executionId === executionId && + matchesConsoleIdentity(entry, identity) ) +} + +function matchesConsoleIdentity(entry: ConsoleEntry, identity: ConsoleUpdate): boolean { + if (identity.executionOrder !== undefined && entry.executionOrder !== identity.executionOrder) { + return false + } + if ( + identity.iterationCurrent !== undefined && + entry.iterationCurrent !== identity.iterationCurrent + ) { + return false + } + if ( + identity.iterationContainerId !== undefined && + entry.iterationContainerId !== identity.iterationContainerId + ) { + return false } + if ( + identity.childWorkflowBlockId !== undefined && + entry.childWorkflowBlockId !== identity.childWorkflowBlockId + ) { + return false + } + if ( + identity.childWorkflowInstanceId !== undefined && + entry.childWorkflowInstanceId !== undefined && + entry.childWorkflowInstanceId !== identity.childWorkflowInstanceId + ) { + return false + } + return true +} + +function normalizeSpanError(error: unknown): string | undefined { + if (error === undefined || error === null) return undefined + return typeof error === 'string' ? 
error : toError(error).message } export interface ExecutionTimingFields { @@ -500,7 +726,7 @@ export function handleExecutionErrorConsole( params.executionId, params.finalBlockLogs ) - deps.cancelRunningEntries(params.workflowId) + deps.cancelRunningEntries(params.workflowId, params.executionId) addExecutionErrorConsoleEntry(deps.addConsole, params) } @@ -585,7 +811,7 @@ export function handleExecutionCancelledConsole( params.executionId, params.finalBlockLogs ) - deps.cancelRunningEntries(params.workflowId) + deps.cancelRunningEntries(params.workflowId, params.executionId) addCancelledConsoleEntry(deps.addConsole, params) } @@ -600,6 +826,7 @@ export interface WorkflowExecutionOptions { useDraftState?: boolean stopAfterBlockId?: string abortSignal?: AbortSignal + preserveExecutionOnTerminal?: boolean /** For run_from_block / run_block: start from a specific block using cached state */ runFromBlock?: { startBlockId: string @@ -622,7 +849,9 @@ export async function executeWorkflowWithFullLogging( } const executionId = options.executionId || generateId() - const { addConsole, updateConsole, cancelRunningEntries } = useTerminalConsoleStore.getState() + const { addConsole, updateConsole, cancelRunningEntries, finishRunningEntries } = + useTerminalConsoleStore.getState() + const clearOnTerminal = options.preserveExecutionOnTerminal !== true const { setActiveBlocks, setBlockRunStatus, setEdgeRunStatus, setCurrentExecutionId } = useExecutionStore.getState() const wfId = targetWorkflowId @@ -632,6 +861,17 @@ export async function executeWorkflowWithFullLogging( const activeBlockRefCounts = new Map() const executionIdRef = { current: executionId } const accumulatedBlockLogs: BlockLog[] = [] + const isCurrentExecution = () => { + return useExecutionStore.getState().getCurrentExecutionId(wfId) === executionIdRef.current + } + const clearExecutionState = () => { + if (!isCurrentExecution()) return + setCurrentExecutionId(wfId, null) + clearExecutionPointer(wfId) + 
consolePersistence.executionEnded() + useExecutionStore.getState().setIsExecuting(wfId, false) + setActiveBlocks(wfId, new Set()) + } const blockHandlers = createBlockEventHandlers( { @@ -705,18 +945,24 @@ export async function executeWorkflowWithFullLogging( output: {}, logs: [], } + let executionFinished = false + let preserveExecutionForRecovery = false try { await processSSEStream( response.body.getReader(), { onEventId: (eventId) => { + if (executionFinished) return if (wfId && executionIdRef.current && eventId % 5 === 0) { - saveExecutionPointer({ - workflowId: wfId, - executionId: executionIdRef.current, - lastEventId: eventId, - }) + const executionId = executionIdRef.current + return consolePersistence.persist().then(() => + saveExecutionPointer({ + workflowId: wfId, + executionId, + lastEventId: eventId, + }) + ) } }, @@ -730,9 +976,10 @@ export async function executeWorkflowWithFullLogging( onBlockChildWorkflowStarted: blockHandlers.onBlockChildWorkflowStarted, onExecutionCompleted: (data) => { - setCurrentExecutionId(wfId, null) + if (!isCurrentExecution()) return + executionFinished = true reconcileFinalBlockLogs(updateConsole, wfId, executionIdRef.current, data.finalBlockLogs) - cancelRunningEntries(wfId) + finishRunningEntries(wfId, executionIdRef.current) executionResult = { success: data.success, output: data.output, @@ -743,10 +990,34 @@ export async function executeWorkflowWithFullLogging( endTime: data.endTime, }, } + if (clearOnTerminal) { + clearExecutionState() + } + }, + + onExecutionPaused: (data) => { + if (!isCurrentExecution()) return + executionFinished = true + reconcileFinalBlockLogs(updateConsole, wfId, executionIdRef.current, data.finalBlockLogs) + finishRunningEntries(wfId, executionIdRef.current) + executionResult = { + success: true, + output: data.output, + logs: accumulatedBlockLogs, + metadata: { + duration: data.duration, + startTime: data.startTime, + endTime: data.endTime, + }, + } + if (clearOnTerminal) { + 
clearExecutionState() + } }, onExecutionCancelled: (data) => { - setCurrentExecutionId(wfId, null) + if (!isCurrentExecution()) return + executionFinished = true executionResult = { success: false, output: {}, @@ -763,10 +1034,14 @@ export async function executeWorkflowWithFullLogging( finalBlockLogs: data?.finalBlockLogs, } ) + if (clearOnTerminal) { + clearExecutionState() + } }, onExecutionError: (data) => { - setCurrentExecutionId(wfId, null) + if (!isCurrentExecution()) return + executionFinished = true const errorMessage = data.error || 'Run failed' executionResult = { success: false, @@ -788,13 +1063,22 @@ export async function executeWorkflowWithFullLogging( finalBlockLogs: data.finalBlockLogs, } ) + if (clearOnTerminal) { + clearExecutionState() + } }, }, 'CopilotExecution' ) + } catch (error) { + if (error instanceof SSEEventHandlerError || error instanceof SSEStreamInterruptedError) { + preserveExecutionForRecovery = true + } + throw error } finally { - setCurrentExecutionId(wfId, null) - setActiveBlocks(wfId, new Set()) + if (!preserveExecutionForRecovery && clearOnTerminal) { + clearExecutionState() + } } return executionResult diff --git a/apps/sim/executor/execution/engine.test.ts b/apps/sim/executor/execution/engine.test.ts index 6147762d496..f0539ebf4f6 100644 --- a/apps/sim/executor/execution/engine.test.ts +++ b/apps/sim/executor/execution/engine.test.ts @@ -158,6 +158,27 @@ describe('ExecutionEngine', () => { expect(result.status).toBeUndefined() }) + it('should not fall back to starter blocks for terminal resume snapshots', async () => { + const startNode = createMockNode('start', 'starter') + const dag = createMockDAG([startNode]) + const context = createMockContext({ + metadata: { + executionId: 'test-execution', + startTime: new Date().toISOString(), + pendingBlocks: [], + resumeFromSnapshot: true, + }, + }) + const edgeManager = createMockEdgeManager() + const nodeOrchestrator = createMockNodeOrchestrator() + + const engine = new 
ExecutionEngine(context, dag, edgeManager, nodeOrchestrator) + const result = await engine.run() + + expect(result.success).toBe(true) + expect(nodeOrchestrator.executionCount).toBe(0) + }) + it('should execute all nodes in a multi-node workflow', async () => { const nodes = [ createMockNode('start', 'starter'), diff --git a/apps/sim/executor/execution/engine.ts b/apps/sim/executor/execution/engine.ts index 555d1ade2c7..82497858911 100644 --- a/apps/sim/executor/execution/engine.ts +++ b/apps/sim/executor/execution/engine.ts @@ -339,6 +339,11 @@ export class ExecutionEngine { return } + if (this.context.metadata.resumeFromSnapshot === true) { + this.execLogger.info('Resume snapshot has no downstream work to queue') + return + } + if (triggerBlockId) { this.addToQueue(triggerBlockId) return diff --git a/apps/sim/executor/execution/types.ts b/apps/sim/executor/execution/types.ts index 3c5130d8220..042ca72ac94 100644 --- a/apps/sim/executor/execution/types.ts +++ b/apps/sim/executor/execution/types.ts @@ -26,6 +26,7 @@ export interface ExecutionMetadata { enforceCredentialAccess?: boolean pendingBlocks?: string[] resumeFromSnapshot?: boolean + resumeTerminalNoop?: boolean credentialAccountUserId?: string workflowStateOverride?: { blocks: Record @@ -54,6 +55,7 @@ export interface SerializableExecutionState { activeExecutionPath: string[] pendingQueue?: string[] remainingEdges?: Edge[] + resumeTerminalNoop?: boolean dagIncomingEdges?: Record completedPauseContexts?: string[] } @@ -133,8 +135,9 @@ export interface ExecutionCallbacks { blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, - executionOrder?: number - ) => void + executionOrder?: number, + childWorkflowContext?: ChildWorkflowContext + ) => Promise } export interface ContextExtensions { @@ -200,8 +203,9 @@ export interface ContextExtensions { blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, - executionOrder?: number - ) => void + 
executionOrder?: number, + childWorkflowContext?: ChildWorkflowContext + ) => Promise /** * Run-from-block configuration. When provided, executor runs in partial diff --git a/apps/sim/executor/handlers/workflow/workflow-handler.ts b/apps/sim/executor/handlers/workflow/workflow-handler.ts index c99b907ea5f..e2f329c206a 100644 --- a/apps/sim/executor/handlers/workflow/workflow-handler.ts +++ b/apps/sim/executor/handlers/workflow/workflow-handler.ts @@ -156,11 +156,12 @@ export class WorkflowBlockHandler implements BlockHandler { ? (nodeMetadata.originalBlockId ?? nodeMetadata.nodeId) : block.id const iterationContext = nodeMetadata ? getIterationContext(ctx, nodeMetadata) : undefined - ctx.onChildWorkflowInstanceReady?.( + await ctx.onChildWorkflowInstanceReady?.( effectiveBlockId, instanceId, iterationContext, - nodeMetadata?.executionOrder + nodeMetadata?.executionOrder, + ctx.childWorkflowContext ) } diff --git a/apps/sim/executor/types.ts b/apps/sim/executor/types.ts index a220d913ecc..8195f385a77 100644 --- a/apps/sim/executor/types.ts +++ b/apps/sim/executor/types.ts @@ -268,6 +268,7 @@ export interface ExecutionMetadata { triggerBlockId?: string useDraftState?: boolean resumeFromSnapshot?: boolean + resumeTerminalNoop?: boolean } export interface BlockState { @@ -385,8 +386,9 @@ export interface ExecutionContext { blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, - executionOrder?: number - ) => void + executionOrder?: number, + childWorkflowContext?: ChildWorkflowContext + ) => Promise /** * AbortSignal for cancellation support. 
diff --git a/apps/sim/hooks/use-execution-stream.test.ts b/apps/sim/hooks/use-execution-stream.test.ts new file mode 100644 index 00000000000..da52635ff99 --- /dev/null +++ b/apps/sim/hooks/use-execution-stream.test.ts @@ -0,0 +1,87 @@ +/** + * @vitest-environment node + */ +import { describe, expect, it, vi } from 'vitest' +import type { ExecutionEvent } from '@/lib/workflows/executor/execution-events' +import { processSSEStream } from '@/hooks/use-execution-stream' + +function streamEvents(events: ExecutionEvent[]): ReadableStream { + const encoder = new TextEncoder() + return new ReadableStream({ + start(controller) { + for (const event of events) { + controller.enqueue(encoder.encode(`data: ${JSON.stringify(event)}\n\n`)) + } + controller.enqueue(encoder.encode('data: [DONE]\n\n')) + controller.close() + }, + }) +} + +describe('processSSEStream', () => { + it('acknowledges event ids only after the matching handler completes', async () => { + const order: string[] = [] + const event: ExecutionEvent = { + type: 'block:started', + eventId: 5, + timestamp: new Date().toISOString(), + executionId: 'exec-1', + workflowId: 'wf-1', + data: { + blockId: 'block-1', + blockName: 'Block 1', + blockType: 'function', + executionOrder: 1, + }, + } + + await processSSEStream( + streamEvents([event]).getReader(), + { + onBlockStarted: async () => { + order.push('handler:start') + await Promise.resolve() + order.push('handler:end') + }, + onEventId: vi.fn(async () => { + order.push('event-id') + }), + }, + 'test' + ) + + expect(order).toEqual(['handler:start', 'handler:end', 'event-id']) + }) + + it('propagates callback failures without acknowledging the event id', async () => { + const event: ExecutionEvent = { + type: 'block:started', + eventId: 6, + timestamp: new Date().toISOString(), + executionId: 'exec-1', + workflowId: 'wf-1', + data: { + blockId: 'block-1', + blockName: 'Block 1', + blockType: 'function', + executionOrder: 1, + }, + } + const onEventId = vi.fn() + + 
await expect( + processSSEStream( + streamEvents([event]).getReader(), + { + onBlockStarted: async () => { + throw new Error('handler failed') + }, + onEventId, + }, + 'test' + ) + ).rejects.toThrow('handler failed') + + expect(onEventId).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/hooks/use-execution-stream.ts b/apps/sim/hooks/use-execution-stream.ts index a05fce82cef..b45a8550ba6 100644 --- a/apps/sim/hooks/use-execution-stream.ts +++ b/apps/sim/hooks/use-execution-stream.ts @@ -32,12 +32,40 @@ export function isExecutionStreamHttpError(error: unknown): error is ExecutionSt return error instanceof ExecutionStreamHttpError } +export class SSEEventHandlerError extends Error { + constructor( + message: string, + public readonly eventType: string, + public readonly eventId: number | undefined, + public readonly executionId: string | undefined, + public readonly originalError: unknown + ) { + super(message) + this.name = 'SSEEventHandlerError' + } +} + +export class SSEStreamInterruptedError extends Error { + constructor( + message: string, + public readonly executionId: string | undefined, + public readonly originalError: unknown + ) { + super(message) + this.name = 'SSEStreamInterruptedError' + } +} + /** * Detects errors caused by the browser killing a fetch (page refresh, navigation, tab close). * These should be treated as clean disconnects, not execution errors. */ function isClientDisconnectError(error: any): boolean { - if (error.name === 'AbortError') return true + return error.name === 'AbortError' +} + +function isRecoverableStreamError(error: any): boolean { + if (isClientDisconnectError(error)) return false const msg = (error.message ?? 
'').toLowerCase() return ( msg.includes('network error') || msg.includes('failed to fetch') || msg.includes('load failed') @@ -75,52 +103,69 @@ export async function processSSEStream( continue } + let event: ExecutionEvent try { - const event = JSON.parse(data) as ExecutionEvent - - if (event.eventId != null) { - callbacks.onEventId?.(event.eventId) - } + event = JSON.parse(data) as ExecutionEvent + } catch (error) { + logger.error('Failed to parse SSE event:', error, { data }) + continue + } + try { switch (event.type) { case 'execution:started': - callbacks.onExecutionStarted?.(event.data) + await callbacks.onExecutionStarted?.(event.data) break case 'execution:completed': - callbacks.onExecutionCompleted?.(event.data) + await callbacks.onExecutionCompleted?.(event.data) break case 'execution:paused': - callbacks.onExecutionPaused?.(event.data) + await callbacks.onExecutionPaused?.(event.data) break case 'execution:error': - callbacks.onExecutionError?.(event.data) + await callbacks.onExecutionError?.(event.data) break case 'execution:cancelled': - callbacks.onExecutionCancelled?.(event.data) + await callbacks.onExecutionCancelled?.(event.data) break case 'block:started': - callbacks.onBlockStarted?.(event.data) + await callbacks.onBlockStarted?.(event.data) break case 'block:completed': - callbacks.onBlockCompleted?.(event.data) + await callbacks.onBlockCompleted?.(event.data) break case 'block:error': - callbacks.onBlockError?.(event.data) + await callbacks.onBlockError?.(event.data) break case 'block:childWorkflowStarted': - callbacks.onBlockChildWorkflowStarted?.(event.data) + await callbacks.onBlockChildWorkflowStarted?.(event.data) break case 'stream:chunk': - callbacks.onStreamChunk?.(event.data) + await callbacks.onStreamChunk?.(event.data) break case 'stream:done': - callbacks.onStreamDone?.(event.data) + await callbacks.onStreamDone?.(event.data) break default: logger.warn('Unknown event type:', (event as any).type) } + + if (event.eventId != null) { + 
await callbacks.onEventId?.(event.eventId) + } } catch (error) { - logger.error('Failed to parse SSE event:', error, { data }) + logger.error('SSE event handler failed:', error, { + eventType: event.type, + eventId: event.eventId, + }) + const message = error instanceof Error ? error.message : String(error) + throw new SSEEventHandlerError( + message, + event.type, + event.eventId, + event.executionId, + error + ) } } } @@ -130,18 +175,18 @@ export async function processSSEStream( } export interface ExecutionStreamCallbacks { - onExecutionStarted?: (data: ExecutionStartedData) => void - onExecutionCompleted?: (data: ExecutionCompletedData) => void - onExecutionPaused?: (data: ExecutionPausedData) => void - onExecutionError?: (data: ExecutionErrorData) => void - onExecutionCancelled?: (data: ExecutionCancelledData) => void - onBlockStarted?: (data: BlockStartedData) => void - onBlockCompleted?: (data: BlockCompletedData) => void - onBlockError?: (data: BlockErrorData) => void - onBlockChildWorkflowStarted?: (data: BlockChildWorkflowStartedData) => void - onStreamChunk?: (data: StreamChunkData) => void - onStreamDone?: (data: StreamDoneData) => void - onEventId?: (eventId: number) => void + onExecutionStarted?: (data: ExecutionStartedData) => void | Promise + onExecutionCompleted?: (data: ExecutionCompletedData) => void | Promise + onExecutionPaused?: (data: ExecutionPausedData) => void | Promise + onExecutionError?: (data: ExecutionErrorData) => void | Promise + onExecutionCancelled?: (data: ExecutionCancelledData) => void | Promise + onBlockStarted?: (data: BlockStartedData) => void | Promise + onBlockCompleted?: (data: BlockCompletedData) => void | Promise + onBlockError?: (data: BlockErrorData) => void | Promise + onBlockChildWorkflowStarted?: (data: BlockChildWorkflowStartedData) => void | Promise + onStreamChunk?: (data: StreamChunkData) => void | Promise + onStreamDone?: (data: StreamDoneData) => void | Promise + onEventId?: (eventId: number) => void | Promise 
} export interface ExecuteStreamOptions { @@ -191,6 +236,30 @@ export interface ReconnectStreamOptions { */ const sharedAbortControllers = new Map() +function executeStreamKey(workflowId: string): string { + return `${workflowId}:execute` +} + +function reconnectStreamKey(workflowId: string, executionId: string): string { + return `${workflowId}:reconnect:${executionId}` +} + +function abortStream(key: string): void { + const controller = sharedAbortControllers.get(key) + if (!controller) return + controller.abort() + sharedAbortControllers.delete(key) +} + +function abortWorkflowStreams(workflowId: string): void { + const prefix = `${workflowId}:` + for (const [key, controller] of sharedAbortControllers) { + if (!key.startsWith(prefix)) continue + controller.abort() + sharedAbortControllers.delete(key) + } +} + /** * Hook for executing workflows via server-side SSE streaming. * Supports concurrent executions via per-workflow AbortController maps. @@ -199,13 +268,12 @@ export function useExecutionStream() { const execute = useCallback(async (options: ExecuteStreamOptions) => { const { workflowId, callbacks = {}, onExecutionId, ...payload } = options - const existing = sharedAbortControllers.get(workflowId) - if (existing) { - existing.abort() - } + abortWorkflowStreams(workflowId) const abortController = new AbortController() - sharedAbortControllers.set(workflowId, abortController) + const streamKey = executeStreamKey(workflowId) + sharedAbortControllers.set(streamKey, abortController) + let serverExecutionId: string | undefined try { // boundary-raw-fetch: workflow execute endpoint returns an SSE stream consumed via response.body.getReader() and processSSEStream; also reads the X-Execution-Id response header @@ -242,7 +310,7 @@ export function useExecutionStream() { throw new Error('No response body') } - const serverExecutionId = response.headers.get('X-Execution-Id') + serverExecutionId = response.headers.get('X-Execution-Id') ?? 
undefined if (serverExecutionId) { onExecutionId?.(serverExecutionId) } @@ -254,15 +322,28 @@ export function useExecutionStream() { logger.info('Execution stream disconnected (page unload or abort)') return } + if (isRecoverableStreamError(error)) { + logger.warn('Execution stream interrupted; preserving execution for reconnect', { + executionId: serverExecutionId, + error: error.message, + }) + throw new SSEStreamInterruptedError( + 'Execution stream interrupted before a terminal event was received', + serverExecutionId, + error + ) + } logger.error('Execution stream error:', error) - callbacks.onExecutionError?.({ - error: error.message || 'Unknown error', - duration: 0, - }) + if (!(error instanceof SSEEventHandlerError)) { + await callbacks.onExecutionError?.({ + error: error.message || 'Unknown error', + duration: 0, + }) + } throw error } finally { - if (sharedAbortControllers.get(workflowId) === abortController) { - sharedAbortControllers.delete(workflowId) + if (sharedAbortControllers.get(streamKey) === abortController) { + sharedAbortControllers.delete(streamKey) } } }, []) @@ -277,13 +358,12 @@ export function useExecutionStream() { callbacks = {}, } = options - const existing = sharedAbortControllers.get(workflowId) - if (existing) { - existing.abort() - } + abortWorkflowStreams(workflowId) const abortController = new AbortController() - sharedAbortControllers.set(workflowId, abortController) + const streamKey = executeStreamKey(workflowId) + sharedAbortControllers.set(streamKey, abortController) + let serverExecutionId: string | undefined try { // boundary-raw-fetch: run-from-block endpoint returns an SSE stream consumed via response.body.getReader() and processSSEStream; also reads the X-Execution-Id response header @@ -324,7 +404,7 @@ export function useExecutionStream() { throw new Error('No response body') } - const serverExecutionId = response.headers.get('X-Execution-Id') + serverExecutionId = response.headers.get('X-Execution-Id') ?? 
undefined if (serverExecutionId) { onExecutionId?.(serverExecutionId) } @@ -336,15 +416,28 @@ export function useExecutionStream() { logger.info('Run-from-block stream disconnected (page unload or abort)') return } + if (isRecoverableStreamError(error)) { + logger.warn('Run-from-block stream interrupted; preserving execution for reconnect', { + executionId: serverExecutionId, + error: error.message, + }) + throw new SSEStreamInterruptedError( + 'Run-from-block stream interrupted before a terminal event was received', + serverExecutionId, + error + ) + } logger.error('Run-from-block execution error:', error) - callbacks.onExecutionError?.({ - error: error.message || 'Unknown error', - duration: 0, - }) + if (!(error instanceof SSEEventHandlerError)) { + await callbacks.onExecutionError?.({ + error: error.message || 'Unknown error', + duration: 0, + }) + } throw error } finally { - if (sharedAbortControllers.get(workflowId) === abortController) { - sharedAbortControllers.delete(workflowId) + if (sharedAbortControllers.get(streamKey) === abortController) { + sharedAbortControllers.delete(streamKey) } } }, []) @@ -352,13 +445,10 @@ export function useExecutionStream() { const reconnect = useCallback(async (options: ReconnectStreamOptions) => { const { workflowId, executionId, fromEventId = 0, callbacks = {} } = options - const existing = sharedAbortControllers.get(workflowId) - if (existing) { - existing.abort() - } - const abortController = new AbortController() - sharedAbortControllers.set(workflowId, abortController) + const streamKey = reconnectStreamKey(workflowId, executionId) + abortStream(streamKey) + sharedAbortControllers.set(streamKey, abortController) try { // boundary-raw-fetch: execution reconnect endpoint returns an SSE stream consumed via response.body.getReader() and processSSEStream const response = await fetch( @@ -376,19 +466,15 @@ export function useExecutionStream() { logger.error('Reconnection stream error:', error) throw error } finally { - if 
(sharedAbortControllers.get(workflowId) === abortController) { - sharedAbortControllers.delete(workflowId) + if (sharedAbortControllers.get(streamKey) === abortController) { + sharedAbortControllers.delete(streamKey) } } }, []) const cancel = useCallback((workflowId?: string) => { if (workflowId) { - const controller = sharedAbortControllers.get(workflowId) - if (controller) { - controller.abort() - sharedAbortControllers.delete(workflowId) - } + abortWorkflowStreams(workflowId) } else { for (const [, controller] of sharedAbortControllers) { controller.abort() @@ -397,10 +483,20 @@ export function useExecutionStream() { } }, []) + const cancelReconnect = useCallback((workflowId: string, executionId: string) => { + abortStream(reconnectStreamKey(workflowId, executionId)) + }, []) + + const cancelExecute = useCallback((workflowId: string) => { + abortStream(executeStreamKey(workflowId)) + }, []) + return { execute, executeFromBlock, reconnect, cancel, + cancelReconnect, + cancelExecute, } } diff --git a/apps/sim/lib/copilot/tools/client/run-tool-execution.test.ts b/apps/sim/lib/copilot/tools/client/run-tool-execution.test.ts index 1de7a8ccc1a..ac5fff66d70 100644 --- a/apps/sim/lib/copilot/tools/client/run-tool-execution.test.ts +++ b/apps/sim/lib/copilot/tools/client/run-tool-execution.test.ts @@ -9,6 +9,8 @@ const { executeWorkflowWithFullLogging, getWorkflowEntries, loadExecutionPointer, + MockSSEEventHandlerError, + MockSSEStreamInterruptedError, saveExecutionPointer, setActiveWorkflow, } = vi.hoisted(() => ({ @@ -16,12 +18,32 @@ const { executeWorkflowWithFullLogging: vi.fn(), getWorkflowEntries: vi.fn(() => []), loadExecutionPointer: vi.fn(), + MockSSEEventHandlerError: class SSEEventHandlerError extends Error { + executionId?: string + + constructor(message: string, executionId?: string) { + super(message) + this.name = 'SSEEventHandlerError' + this.executionId = executionId + } + }, + MockSSEStreamInterruptedError: class SSEStreamInterruptedError extends Error 
{ + executionId?: string + + constructor(message: string, executionId?: string) { + super(message) + this.name = 'SSEStreamInterruptedError' + this.executionId = executionId + } + }, saveExecutionPointer: vi.fn(), setActiveWorkflow: vi.fn(), })) const setIsExecuting = vi.fn() +const setActiveBlocks = vi.fn() const setCurrentExecutionId = vi.fn() +const getCurrentExecutionId = vi.fn() const getWorkflowExecution = vi.fn(() => ({ isExecuting: false })) vi.mock('@/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils', () => ({ @@ -31,13 +53,20 @@ vi.mock('@/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-u vi.mock('@/stores/execution/store', () => ({ useExecutionStore: { getState: () => ({ + getCurrentExecutionId, getWorkflowExecution, + setActiveBlocks, setIsExecuting, setCurrentExecutionId, }), }, })) +vi.mock('@/hooks/use-execution-stream', () => ({ + SSEEventHandlerError: MockSSEEventHandlerError, + SSEStreamInterruptedError: MockSSEStreamInterruptedError, +})) + vi.mock('@/stores/workflows/registry/store', () => ({ useWorkflowRegistry: { getState: () => ({ @@ -73,6 +102,7 @@ import { describe('run tool execution cancellation', () => { beforeEach(() => { vi.clearAllMocks() + getCurrentExecutionId.mockReturnValue(null) getWorkflowEntries.mockReturnValue([]) loadExecutionPointer.mockResolvedValue(null) vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ ok: true })) @@ -144,7 +174,9 @@ describe('run tool execution cancellation', () => { ) }) - it('binds a recovered execution without starting a new workflow run', async () => { + it('treats a tab-local execution pointer as handled in background', async () => { + const fetchMock = vi.fn().mockResolvedValue({ ok: true }) + vi.stubGlobal('fetch', fetchMock) loadExecutionPointer.mockResolvedValueOnce({ workflowId: 'wf-1', executionId: 'exec-existing', @@ -153,14 +185,67 @@ describe('run tool execution cancellation', () => { await expect(bindRunToolToExecution('tool-3', 
'wf-1')).resolves.toBe(true) - expect(setActiveWorkflow).toHaveBeenCalledWith('wf-1') - expect(setIsExecuting).toHaveBeenCalledWith('wf-1', true) - expect(setCurrentExecutionId).toHaveBeenCalledWith('wf-1', 'exec-existing') - expect(saveExecutionPointer).toHaveBeenCalledWith({ - workflowId: 'wf-1', - executionId: 'exec-existing', - lastEventId: 7, - }) + expect(setActiveWorkflow).not.toHaveBeenCalled() + expect(setIsExecuting).not.toHaveBeenCalled() + expect(setCurrentExecutionId).not.toHaveBeenCalled() + expect(saveExecutionPointer).not.toHaveBeenCalled() expect(executeWorkflowWithFullLogging).not.toHaveBeenCalled() + expect(fetchMock).toHaveBeenCalledWith( + '/api/copilot/confirm', + expect.objectContaining({ + method: 'POST', + body: expect.stringContaining('"status":"background"'), + }) + ) + }) + + it('does not recover from shared console rows without a tab-local pointer', async () => { + loadExecutionPointer.mockResolvedValueOnce(null) + getWorkflowEntries.mockReturnValueOnce([ + { + workflowId: 'wf-1', + executionId: 'exec-shared', + isRunning: true, + startedAt: new Date().toISOString(), + }, + ]) + + await expect(bindRunToolToExecution('tool-4', 'wf-1')).resolves.toBe(false) + + expect(setActiveWorkflow).not.toHaveBeenCalled() + expect(setIsExecuting).not.toHaveBeenCalled() + expect(setCurrentExecutionId).not.toHaveBeenCalled() + expect(saveExecutionPointer).not.toHaveBeenCalled() + }) + + it('reports local stream handler failures as background instead of workflow errors', async () => { + const fetchMock = vi.fn().mockResolvedValue({ ok: true }) + vi.stubGlobal('fetch', fetchMock) + getCurrentExecutionId.mockImplementation( + () => saveExecutionPointer.mock.calls[0]?.[0]?.executionId ?? 
null + ) + executeWorkflowWithFullLogging.mockRejectedValueOnce( + new MockSSEEventHandlerError('handler failed', 'exec-1') + ) + + executeRunToolOnClient('tool-5', 'run_workflow', { workflowId: 'wf-1' }) + + await vi.waitFor(() => { + expect(fetchMock).toHaveBeenCalledWith( + '/api/copilot/confirm', + expect.objectContaining({ + method: 'POST', + body: expect.stringContaining('"status":"background"'), + }) + ) + }) + expect(clearExecutionPointer).not.toHaveBeenCalled() + expect(setIsExecuting).toHaveBeenCalledWith('wf-1', false) + expect(fetchMock).not.toHaveBeenCalledWith( + '/api/copilot/confirm', + expect.objectContaining({ + body: expect.stringContaining('"status":"error"'), + }) + ) }) }) diff --git a/apps/sim/lib/copilot/tools/client/run-tool-execution.ts b/apps/sim/lib/copilot/tools/client/run-tool-execution.ts index 0dcd3786597..18104b415fe 100644 --- a/apps/sim/lib/copilot/tools/client/run-tool-execution.ts +++ b/apps/sim/lib/copilot/tools/client/run-tool-execution.ts @@ -1,7 +1,12 @@ import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' +import { sleep } from '@sim/utils/helpers' import { generateId } from '@sim/utils/id' -import type { AsyncCompletionData } from '@/lib/copilot/async-runs/lifecycle' +import { + ASYNC_TOOL_CONFIRMATION_STATUS, + type AsyncCompletionData, + type AsyncConfirmationStatus, +} from '@/lib/copilot/async-runs/lifecycle' import { COPILOT_CONFIRM_API_PATH } from '@/lib/copilot/constants' import { MothershipStreamV1ToolOutcome } from '@/lib/copilot/generated/mothership-stream-v1' import { @@ -11,14 +16,13 @@ import { } from '@/lib/copilot/generated/tool-catalog-v1' import { traceparentHeader } from '@/lib/copilot/tools/client/trace-context' import { executeWorkflowWithFullLogging } from '@/app/workspace/[workspaceId]/w/[workflowId]/utils/workflow-execution-utils' +import { SSEEventHandlerError, SSEStreamInterruptedError } from '@/hooks/use-execution-stream' import { useExecutionStore } from 
'@/stores/execution/store' import { clearExecutionPointer, consolePersistence, - type ExecutionPointer, loadExecutionPointer, saveExecutionPointer, - useTerminalConsoleStore, } from '@/stores/terminal' import { useWorkflowRegistry } from '@/stores/workflows/registry/store' @@ -26,6 +30,20 @@ const logger = createLogger('CopilotRunToolExecution') const activeRunToolByWorkflowId = new Map() const activeRunAbortByWorkflowId = new Map() const manuallyStoppedToolCallIds = new Set() +const PENDING_COMPLETION_STORAGE_PREFIX = 'sim:copilot:run-tool-completion:' + +interface PendingCompletionReport { + status: AsyncConfirmationStatus + message?: string + data?: AsyncCompletionData +} + +class CompletionReportError extends Error { + constructor(message: string) { + super(message) + this.name = 'CompletionReportError' + } +} function isRecord(value: unknown): value is Record { return Boolean(value) && typeof value === 'object' && !Array.isArray(value) @@ -47,55 +65,62 @@ function resolveTriggerBlockId(params: Record): string | undefi : undefined } -function getRunningExecutionPointer(workflowId: string): ExecutionPointer | null { - const runningEntries = useTerminalConsoleStore - .getState() - .getWorkflowEntries(workflowId) - .filter((entry) => entry.isRunning && entry.executionId) +function pendingCompletionStorageKey(toolCallId: string): string { + return `${PENDING_COMPLETION_STORAGE_PREFIX}${toolCallId}` +} - if (runningEntries.length === 0) { - return null +function savePendingCompletionReport(toolCallId: string, report: PendingCompletionReport): void { + if (typeof window === 'undefined') return + try { + window.sessionStorage.setItem(pendingCompletionStorageKey(toolCallId), JSON.stringify(report)) + } catch (error) { + logger.warn('[RunTool] Failed to persist pending completion report', { + toolCallId, + error: toError(error).message, + }) } +} - const latestEntry = [...runningEntries].sort((a, b) => { - const aStartedAt = a.startedAt ? 
new Date(a.startedAt).getTime() : 0 - const bStartedAt = b.startedAt ? new Date(b.startedAt).getTime() : 0 - return bStartedAt - aStartedAt - })[0] - - const executionId = latestEntry?.executionId - if (!executionId) { +function loadPendingCompletionReport(toolCallId: string): PendingCompletionReport | null { + if (typeof window === 'undefined') return null + try { + const raw = window.sessionStorage.getItem(pendingCompletionStorageKey(toolCallId)) + if (!raw) return null + const parsed = JSON.parse(raw) as PendingCompletionReport + return parsed?.status ? parsed : null + } catch (error) { + logger.warn('[RunTool] Failed to load pending completion report', { + toolCallId, + error: toError(error).message, + }) return null } - - return { - workflowId, - executionId, - lastEventId: 0, - } } -async function findRecoverableExecutionPointer( - workflowId: string -): Promise { - const pointer = await loadExecutionPointer(workflowId) - if (pointer?.executionId) { - return pointer +function clearPendingCompletionReport(toolCallId: string): void { + if (typeof window === 'undefined') return + try { + window.sessionStorage.removeItem(pendingCompletionStorageKey(toolCallId)) + } catch (error) { + logger.warn('[RunTool] Failed to clear pending completion report', { + toolCallId, + error: toError(error).message, + }) } - - return getRunningExecutionPointer(workflowId) } export async function bindRunToolToExecution( toolCallId: string, workflowId: string ): Promise { - const executionPointer = await findRecoverableExecutionPointer(workflowId) - if (!executionPointer) { - return false - } - const existingToolCallId = activeRunToolByWorkflowId.get(workflowId) + if (existingToolCallId === toolCallId) { + logger.info('[RunTool] Recovery skipped: run tool is already active in this tab', { + workflowId, + toolCallId, + }) + return true + } if (existingToolCallId && existingToolCallId !== toolCallId) { logger.warn('[RunTool] Recovery skipped: another run tool is already active', { 
workflowId, @@ -105,20 +130,60 @@ export async function bindRunToolToExecution( return false } - useWorkflowRegistry.getState().setActiveWorkflow(workflowId) - activeRunToolByWorkflowId.set(workflowId, toolCallId) - - const { setCurrentExecutionId, setIsExecuting } = useExecutionStore.getState() - setIsExecuting(workflowId, true) - setCurrentExecutionId(workflowId, executionPointer.executionId) - saveExecutionPointer(executionPointer) + const pointer = await loadExecutionPointer(workflowId).catch(() => null) + if (!pointer?.executionId) { + logger.info('[RunTool] Recovery skipped: no tab-local execution pointer', { + workflowId, + toolCallId, + }) + return false + } - logger.info('[RunTool] Reattached tool call to existing workflow execution', { + logger.info('[RunTool] Recovery moved to background for existing execution pointer', { workflowId, toolCallId, - executionId: executionPointer.executionId, - lastEventId: executionPointer.lastEventId, + executionId: pointer.executionId, }) + const pendingCompletion = loadPendingCompletionReport(toolCallId) + if (pendingCompletion) { + try { + await reportCompletion( + toolCallId, + pendingCompletion.status, + pendingCompletion.message, + pendingCompletion.data + ) + clearPendingCompletionReport(toolCallId) + } catch (error) { + logger.warn('[RunTool] Failed to report recovered terminal completion', { + workflowId, + toolCallId, + executionId: pointer.executionId, + error: toError(error).message, + }) + } + return true + } + + try { + await reportCompletion( + toolCallId, + ASYNC_TOOL_CONFIRMATION_STATUS.background, + 'Client recovered an existing workflow execution; continuing in background.', + { + workflowId, + executionId: pointer.executionId, + lastEventId: pointer.lastEventId, + } + ) + } catch (error) { + logger.warn('[RunTool] Failed to report recovered execution as background', { + workflowId, + toolCallId, + executionId: pointer.executionId, + error: toError(error).message, + }) + } return true } @@ -292,6 
+357,15 @@ async function doExecuteRunTool( setCurrentExecutionId(targetWorkflowId, executionId) saveExecutionPointer({ workflowId: targetWorkflowId, executionId, lastEventId: 0 }) const executionStartTime = new Date().toISOString() + const releaseVisibleExecutionForBackground = () => { + const { setCurrentExecutionId: clearExecId, setActiveBlocks } = useExecutionStore.getState() + if (activeRunToolByWorkflowId.get(targetWorkflowId) === toolCallId) { + clearExecId(targetWorkflowId, null) + consolePersistence.executionEnded() + setIsExecuting(targetWorkflowId, false) + setActiveBlocks(targetWorkflowId, new Set()) + } + } const onPageHide = () => { if (manuallyStoppedToolCallIds.has(toolCallId)) return @@ -325,6 +399,8 @@ async function doExecuteRunTool( runFromBlock: runFromBlock ? { startBlockId: runFromBlock.startBlockId } : undefined, }) + let leaveExecutionRecoverable = false + try { const result = await executeWorkflowWithFullLogging({ workflowId: targetWorkflowId, @@ -336,6 +412,7 @@ async function doExecuteRunTool( stopAfterBlockId, runFromBlock, abortSignal: abortController.signal, + preserveExecutionOnTerminal: true, }) // Determine success (same logic as staging's RunWorkflowClientTool) @@ -368,21 +445,35 @@ async function doExecuteRunTool( }) } else if (succeeded) { logger.info('[RunTool] Workflow execution succeeded', { toolCallId, toolName }) + const pendingCompletion = { + status: MothershipStreamV1ToolOutcome.success, + message: `Workflow execution completed. Started at: ${executionStartTime}`, + data: buildResultData(result), + } + savePendingCompletionReport(toolCallId, pendingCompletion) await reportCompletion( toolCallId, - MothershipStreamV1ToolOutcome.success, - `Workflow execution completed. 
Started at: ${executionStartTime}`, - buildResultData(result) + pendingCompletion.status, + pendingCompletion.message, + pendingCompletion.data ) + clearPendingCompletionReport(toolCallId) } else { const msg = errorMessage || 'Workflow execution failed' logger.error('[RunTool] Workflow execution failed', { toolCallId, toolName, error: msg }) + const pendingCompletion = { + status: MothershipStreamV1ToolOutcome.error, + message: msg, + data: buildResultData(result), + } + savePendingCompletionReport(toolCallId, pendingCompletion) await reportCompletion( toolCallId, - MothershipStreamV1ToolOutcome.error, - msg, - buildResultData(result) + pendingCompletion.status, + pendingCompletion.message, + pendingCompletion.data ) + clearPendingCompletionReport(toolCallId) } } catch (err) { if (manuallyStoppedToolCallIds.has(toolCallId)) { @@ -392,6 +483,35 @@ async function doExecuteRunTool( }) } else { const msg = toError(err).message + if (err instanceof SSEEventHandlerError || err instanceof SSEStreamInterruptedError) { + leaveExecutionRecoverable = true + logger.warn( + '[RunTool] Execution stream interrupted; leaving workflow execution in background', + { + toolCallId, + toolName, + executionId: err.executionId, + error: msg, + } + ) + releaseVisibleExecutionForBackground() + await reportCompletion( + toolCallId, + ASYNC_TOOL_CONFIRMATION_STATUS.background, + 'Client lost local stream processing; workflow execution may still be continuing server-side.' 
+ ) + return + } + if (err instanceof CompletionReportError) { + leaveExecutionRecoverable = true + logger.warn('[RunTool] Completion report failed; leaving workflow execution recoverable', { + toolCallId, + toolName, + error: msg, + }) + releaseVisibleExecutionForBackground() + return + } logger.error('[RunTool] Workflow execution threw', { toolCallId, toolName, error: msg }) await reportCompletion(toolCallId, MothershipStreamV1ToolOutcome.error, msg) } @@ -408,11 +528,14 @@ async function doExecuteRunTool( if (activeAbortController === abortController) { activeRunAbortByWorkflowId.delete(targetWorkflowId) } - const { setCurrentExecutionId: clearExecId } = useExecutionStore.getState() - clearExecId(targetWorkflowId, null) - clearExecutionPointer(targetWorkflowId) - consolePersistence.executionEnded() - setIsExecuting(targetWorkflowId, false) + const { setCurrentExecutionId: clearExecId, setActiveBlocks } = useExecutionStore.getState() + if (!leaveExecutionRecoverable && activeToolCallId === toolCallId) { + clearExecId(targetWorkflowId, null) + clearExecutionPointer(targetWorkflowId) + consolePersistence.executionEnded() + setIsExecuting(targetWorkflowId, false) + setActiveBlocks(targetWorkflowId, new Set()) + } } } @@ -454,54 +577,65 @@ function buildResultData(result: unknown): Record | undefined { */ async function reportCompletion( toolCallId: string, - status: MothershipStreamV1ToolOutcome, + status: AsyncConfirmationStatus, message?: string, data?: AsyncCompletionData ): Promise { - try { - const body = JSON.stringify({ - toolCallId, - status, - message: message || (status === 'success' ? 'Tool completed' : 'Tool failed'), - ...(data !== undefined ? { data } : {}), - }) - const res = await fetch(COPILOT_CONFIRM_API_PATH, { + const basePayload = { + toolCallId, + status, + message: message || (status === 'success' ? 'Tool completed' : 'Tool failed'), + ...(data !== undefined ? 
{ data } : {}), + } + const send = async (body: string) => + fetch(COPILOT_CONFIRM_API_PATH, { method: 'POST', headers: { 'Content-Type': 'application/json', ...traceparentHeader() }, body, }) - const LARGE_PAYLOAD_THRESHOLD = 10 * 1024 * 1024 - const bodySize = new Blob([body]).size - if (!res.ok && isRecord(data) && bodySize > LARGE_PAYLOAD_THRESHOLD) { - const { logs: _logs, ...dataWithoutLogs } = data - logger.warn('[RunTool] reportCompletion failed with large payload, retrying without logs', { - toolCallId, - status: res.status, - bodySize, - }) - const retryRes = await fetch(COPILOT_CONFIRM_API_PATH, { - method: 'POST', - headers: { 'Content-Type': 'application/json', ...traceparentHeader() }, - body: JSON.stringify({ - toolCallId, - status, - message: message || (status === 'success' ? 'Tool completed' : 'Tool failed'), - data: dataWithoutLogs, - }), - }) - if (!retryRes.ok) { - logger.warn('[RunTool] reportCompletion retry also failed', { + + const body = JSON.stringify(basePayload) + const LARGE_PAYLOAD_THRESHOLD = 10 * 1024 * 1024 + const bodySize = new Blob([body]).size + let lastError: Error | null = null + + for (let attempt = 1; attempt <= 2; attempt++) { + try { + const res = await send(body) + if (res.ok) return + + if (isRecord(data) && bodySize > LARGE_PAYLOAD_THRESHOLD) { + const { logs: _logs, ...dataWithoutLogs } = data + logger.warn('[RunTool] reportCompletion failed with large payload, retrying without logs', { toolCallId, - status: retryRes.status, + status: res.status, + bodySize, }) + const retryRes = await send( + JSON.stringify({ + toolCallId, + status, + message: message || (status === 'success' ? 
'Tool completed' : 'Tool failed'), + data: dataWithoutLogs, + }) + ) + if (retryRes.ok) return + lastError = new Error(`reportCompletion retry failed with status ${retryRes.status}`) + } else { + lastError = new Error(`reportCompletion failed with status ${res.status}`) } - } else if (!res.ok) { - logger.warn('[RunTool] reportCompletion failed', { toolCallId, status: res.status }) + } catch (err) { + lastError = toError(err) + } + + if (attempt < 2) { + await sleep(250) } - } catch (err) { - logger.error('[RunTool] reportCompletion error', { - toolCallId, - error: toError(err).message, - }) } + + logger.error('[RunTool] reportCompletion failed after retries', { + toolCallId, + error: lastError?.message, + }) + throw new CompletionReportError(lastError?.message ?? 'Failed to report tool completion') } diff --git a/apps/sim/lib/execution/event-buffer.test.ts b/apps/sim/lib/execution/event-buffer.test.ts new file mode 100644 index 00000000000..7e03ab8954d --- /dev/null +++ b/apps/sim/lib/execution/event-buffer.test.ts @@ -0,0 +1,317 @@ +/** + * @vitest-environment node + */ +import { beforeEach, describe, expect, it, vi } from 'vitest' +import type { ExecutionEventEntry } from '@/lib/execution/event-buffer' +import type { ExecutionEvent } from '@/lib/workflows/executor/execution-events' + +const { mockGetRedisClient, mockRedis, persistedEntries } = vi.hoisted(() => { + const persistedEntries: ExecutionEventEntry[] = [] + const mockRedis = { + incrby: vi.fn(), + hset: vi.fn(), + expire: vi.fn(), + hgetall: vi.fn(), + zrangebyscore: vi.fn(), + zremrangebyrank: vi.fn(), + pipeline: vi.fn(), + eval: vi.fn(), + } + const mockGetRedisClient = vi.fn(() => mockRedis) + return { mockGetRedisClient, mockRedis, persistedEntries } +}) + +vi.mock('@/lib/core/config/redis', () => ({ + getRedisClient: mockGetRedisClient, +})) + +import { + createExecutionEventWriter, + flushExecutionStreamReplayBuffer, + initializeExecutionStreamMeta, + readExecutionEventsState, +} from 
'@/lib/execution/event-buffer' + +function makeEvent(blockId: string): ExecutionEvent { + return { + type: 'block:started', + timestamp: new Date().toISOString(), + executionId: 'exec-1', + workflowId: 'wf-1', + data: { + blockId, + blockName: blockId, + blockType: 'function', + executionOrder: 1, + }, + } +} + +describe('execution event buffer', () => { + beforeEach(() => { + vi.clearAllMocks() + persistedEntries.length = 0 + mockGetRedisClient.mockReturnValue(mockRedis) + mockRedis.hgetall.mockResolvedValue({}) + mockRedis.zrangebyscore.mockResolvedValue([]) + mockRedis.zremrangebyrank.mockResolvedValue(0) + mockRedis.eval.mockImplementation( + async ( + _script: string, + _keyCount: number, + _eventsKey: string, + _seqKey: string, + _metaKey: string, + _ttl: number, + _eventLimit: number, + _updatedAt: string, + terminalStatus: string, + ...args: (string | number)[] + ) => { + for (let i = 0; i < args.length; i += 2) { + persistedEntries.push(JSON.parse(args[i + 1] as string) as ExecutionEventEntry) + } + if (terminalStatus) { + await mockRedis.hset('meta', { status: terminalStatus }) + } + return persistedEntries[0]?.eventId ?? 
false + } + ) + mockRedis.pipeline.mockImplementation(() => ({ + zadd: vi.fn((_key: string, ...args: (string | number)[]) => { + for (let i = 0; i < args.length; i += 2) { + persistedEntries.push(JSON.parse(args[i + 1] as string) as ExecutionEventEntry) + } + }), + expire: vi.fn(), + zremrangebyrank: vi.fn(), + exec: vi.fn().mockResolvedValue(undefined), + })) + }) + + it('serializes event id reservation so reconnect replay preserves write order', async () => { + let releaseReservation: ((value: number) => void) | undefined + mockRedis.incrby.mockReturnValueOnce( + new Promise((resolve) => { + releaseReservation = resolve + }) + ) + + const writer = createExecutionEventWriter('exec-1') + const firstWrite = writer.write(makeEvent('first')) + const secondWrite = writer.write(makeEvent('second')) + + await Promise.resolve() + expect(mockRedis.incrby).toHaveBeenCalledTimes(1) + + releaseReservation?.(100) + await expect(Promise.all([firstWrite, secondWrite])).resolves.toMatchObject([ + { eventId: 1 }, + { eventId: 2 }, + ]) + + await writer.close() + + expect(persistedEntries.map((entry) => entry.eventId)).toEqual([1, 2]) + expect( + persistedEntries.map((entry) => (entry.event.data as { blockId: string }).blockId) + ).toEqual(['first', 'second']) + }) + + it('flush waits for queued writes before returning', async () => { + let releaseReservation: ((value: number) => void) | undefined + mockRedis.incrby.mockReturnValueOnce( + new Promise((resolve) => { + releaseReservation = resolve + }) + ) + + const writer = createExecutionEventWriter('exec-1') + const write = writer.write(makeEvent('terminal')) + const flush = writer.flush() + + await Promise.resolve() + expect(persistedEntries).toEqual([]) + + releaseReservation?.(100) + await write + await flush + + expect(persistedEntries.map((entry) => entry.eventId)).toEqual([1]) + expect((persistedEntries[0].event.data as { blockId: string }).blockId).toBe('terminal') + }) + + it('flush drains events appended while another 
flush is in flight', async () => { + mockRedis.incrby.mockResolvedValue(100) + let releaseFirstFlush: (() => void) | undefined + const execCalls: Array<() => Promise> = [ + () => + new Promise((resolve) => { + releaseFirstFlush = resolve + }), + () => Promise.resolve(), + ] + + mockRedis.eval.mockImplementation(async (_script: string, ...args: unknown[]) => { + const batchEntries: ExecutionEventEntry[] = [] + const zaddArgs = args.slice(8) as (string | number)[] + for (let i = 0; i < zaddArgs.length; i += 2) { + batchEntries.push(JSON.parse(zaddArgs[i + 1] as string) as ExecutionEventEntry) + } + await (execCalls.shift() ?? (() => Promise.resolve()))() + persistedEntries.push(...batchEntries) + return persistedEntries[0]?.eventId ?? false + }) + mockRedis.pipeline.mockImplementation(() => { + const batchEntries: ExecutionEventEntry[] = [] + return { + zadd: vi.fn((_key: string, ...args: (string | number)[]) => { + for (let i = 0; i < args.length; i += 2) { + batchEntries.push(JSON.parse(args[i + 1] as string) as ExecutionEventEntry) + } + }), + expire: vi.fn(), + zremrangebyrank: vi.fn(), + exec: vi.fn(async () => { + await (execCalls.shift() ?? 
(() => Promise.resolve()))() + persistedEntries.push(...batchEntries) + }), + } + }) + + const writer = createExecutionEventWriter('exec-1') + await writer.write(makeEvent('first')) + const firstFlush = writer.flush() + + await Promise.resolve() + expect(persistedEntries).toEqual([]) + + await writer.write(makeEvent('terminal')) + const terminalFlush = writer.flush() + + releaseFirstFlush?.() + await firstFlush + await terminalFlush + + expect( + persistedEntries.map((entry) => (entry.event.data as { blockId: string }).blockId) + ).toEqual(['first', 'terminal']) + }) + + it('flush surfaces queued write failures', async () => { + mockRedis.incrby.mockRejectedValueOnce(new Error('redis reservation failed')) + + const writer = createExecutionEventWriter('exec-1') + await expect(writer.write(makeEvent('lost'))).rejects.toThrow('redis reservation failed') + await expect(writer.flush()).rejects.toThrow('redis reservation failed') + }) + + it('allows terminal finalization after a recovered queued write failure', async () => { + mockRedis.incrby + .mockRejectedValueOnce(new Error('redis reservation failed')) + .mockResolvedValueOnce(200) + + const writer = createExecutionEventWriter('exec-1') + await expect(writer.write(makeEvent('lost'))).rejects.toThrow('redis reservation failed') + await writer.write(makeEvent('terminal')) + + await expect(flushExecutionStreamReplayBuffer('exec-1', writer)).resolves.toBe(true) + expect(persistedEntries.map((entry) => entry.eventId)).toEqual([101]) + expect(mockRedis.hset).not.toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ status: 'complete' }) + ) + }) + + it('does not write terminal meta when the final replay flush fails', async () => { + mockRedis.incrby.mockResolvedValue(100) + mockRedis.eval.mockRejectedValue(new Error('redis flush failed')) + + const writer = createExecutionEventWriter('exec-1') + await writer.write(makeEvent('terminal')) + + await expect(flushExecutionStreamReplayBuffer('exec-1', 
writer)).resolves.toBe(false) + expect(mockRedis.hset).not.toHaveBeenCalled() + }) + + it('flushes replay events after a recovered final replay flush without terminal meta', async () => { + mockRedis.incrby.mockResolvedValue(100) + let flushAttempt = 0 + mockRedis.eval.mockImplementation(async (_script: string, ...args: unknown[]) => { + const zaddArgs = args.slice(8) as (string | number)[] + if (flushAttempt > 0) { + for (let i = 0; i < zaddArgs.length; i += 2) { + persistedEntries.push(JSON.parse(zaddArgs[i + 1] as string) as ExecutionEventEntry) + } + } + if (flushAttempt++ === 0) { + throw new Error('first flush failed') + } + return persistedEntries[0]?.eventId ?? false + }) + mockRedis.pipeline.mockImplementation(() => ({ + zadd: vi.fn((_key: string, ...args: (string | number)[]) => { + if (flushAttempt > 0) { + for (let i = 0; i < args.length; i += 2) { + persistedEntries.push(JSON.parse(args[i + 1] as string) as ExecutionEventEntry) + } + } + }), + expire: vi.fn(), + zremrangebyrank: vi.fn(), + exec: vi.fn(async () => { + if (flushAttempt++ === 0) { + throw new Error('first flush failed') + } + }), + })) + + const writer = createExecutionEventWriter('exec-1') + await writer.write(makeEvent('terminal')) + + await expect(flushExecutionStreamReplayBuffer('exec-1', writer)).resolves.toBe(true) + expect(persistedEntries.map((entry) => entry.eventId)).toEqual([1]) + expect(mockRedis.hset).not.toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ status: 'complete' }) + ) + }) + + it('writes terminal event and terminal meta atomically through writeTerminal', async () => { + mockRedis.incrby.mockResolvedValue(100) + + const writer = createExecutionEventWriter('exec-1') + await writer.writeTerminal(makeEvent('terminal'), 'complete') + + expect(persistedEntries.map((entry) => entry.eventId)).toEqual([1]) + expect(mockRedis.hset).toHaveBeenCalledWith('meta', { status: 'complete' }) + }) + + it('retries active meta initialization before giving up', 
async () => { + mockRedis.hset.mockRejectedValueOnce(new Error('meta write failed')).mockResolvedValueOnce(1) + + await expect( + initializeExecutionStreamMeta('exec-1', { userId: 'user-1', workflowId: 'wf-1' }) + ).resolves.toBe(true) + + expect(mockRedis.hset).toHaveBeenCalledTimes(2) + expect(mockRedis.hset).toHaveBeenLastCalledWith( + 'execution:stream:exec-1:meta', + expect.objectContaining({ + status: 'active', + userId: 'user-1', + workflowId: 'wf-1', + }) + ) + }) + + it('reports pruned replay buffers before reading incomplete events', async () => { + mockRedis.hgetall.mockResolvedValue({ status: 'active', earliestEventId: '10' }) + + await expect(readExecutionEventsState('exec-1', 0)).resolves.toEqual({ + status: 'pruned', + earliestEventId: 10, + }) + expect(mockRedis.zrangebyscore).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/lib/execution/event-buffer.ts b/apps/sim/lib/execution/event-buffer.ts index c9919852a4e..02f5d750b18 100644 --- a/apps/sim/lib/execution/event-buffer.ts +++ b/apps/sim/lib/execution/event-buffer.ts @@ -1,5 +1,6 @@ import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' +import { env } from '@/lib/core/config/env' import { getRedisClient } from '@/lib/core/config/redis' import type { ExecutionEvent } from '@/lib/workflows/executor/execution-events' @@ -12,6 +13,27 @@ const RESERVE_BATCH = 100 const FLUSH_INTERVAL_MS = 15 const FLUSH_MAX_BATCH = 200 const MAX_PENDING_EVENTS = 1000 +const ACTIVE_META_ATTEMPTS = 3 +const FINALIZE_FLUSH_ATTEMPTS = 2 +const FLUSH_EVENTS_SCRIPT = ` +local terminal_status = ARGV[4] +for i = 5, #ARGV, 2 do + redis.call('ZADD', KEYS[1], ARGV[i], ARGV[i + 1]) +end +redis.call('EXPIRE', KEYS[1], tonumber(ARGV[1])) +redis.call('EXPIRE', KEYS[2], tonumber(ARGV[1])) +redis.call('ZREMRANGEBYRANK', KEYS[1], 0, -tonumber(ARGV[2]) - 1) +local oldest = redis.call('ZRANGE', KEYS[1], 0, 0, 'WITHSCORES') +if terminal_status ~= '' then + redis.call('HSET', KEYS[3], 'status', 
terminal_status, 'updatedAt', ARGV[3]) + redis.call('EXPIRE', KEYS[3], tonumber(ARGV[1])) +end +if oldest[2] then + redis.call('HSET', KEYS[3], 'earliestEventId', tostring(math.floor(tonumber(oldest[2]))), 'updatedAt', ARGV[3]) + redis.call('EXPIRE', KEYS[3], tonumber(ARGV[1])) +end +return oldest[2] or false +` function getEventsKey(executionId: string) { return `${REDIS_PREFIX}${executionId}:events` @@ -36,28 +58,242 @@ export interface ExecutionStreamMeta { userId?: string workflowId?: string updatedAt?: string + earliestEventId?: number + replayStartEventId?: number } +export type TerminalExecutionStreamStatus = Exclude + +export type ExecutionMetaReadResult = + | { status: 'found'; meta: ExecutionStreamMeta } + | { status: 'missing' } + | { status: 'unavailable'; error: string } + +export type ExecutionEventsReadResult = + | { status: 'ok'; events: ExecutionEventEntry[] } + | { status: 'pruned'; earliestEventId: number } + | { status: 'unavailable'; error: string } + export interface ExecutionEventEntry { eventId: number executionId: string event: ExecutionEvent } +interface MemoryExecutionStream { + events: ExecutionEventEntry[] + meta: ExecutionStreamMeta | null + nextEventId: number + expiresAt: number +} + export interface ExecutionEventWriter { write: (event: ExecutionEvent) => Promise + writeTerminal: ( + event: ExecutionEvent, + status: TerminalExecutionStreamStatus + ) => Promise flush: () => Promise close: () => Promise } +const memoryExecutionStreams = new Map() + +function canUseMemoryEventBuffer(): boolean { + return typeof window === 'undefined' && !env.REDIS_URL +} + +function pruneExpiredMemoryStreams(now = Date.now()): void { + for (const [executionId, stream] of memoryExecutionStreams) { + if (stream.expiresAt <= now) { + memoryExecutionStreams.delete(executionId) + } + } +} + +function getMemoryStream(executionId: string): MemoryExecutionStream { + pruneExpiredMemoryStreams() + let stream = memoryExecutionStreams.get(executionId) + if 
(!stream) { + stream = { + events: [], + meta: null, + nextEventId: 1, + expiresAt: Date.now() + TTL_SECONDS * 1000, + } + memoryExecutionStreams.set(executionId, stream) + } + return stream +} + +function touchMemoryStream(stream: MemoryExecutionStream): void { + stream.expiresAt = Date.now() + TTL_SECONDS * 1000 +} + +function isReplayBeforeAvailableEvents( + afterEventId: number, + earliestEventId?: number, + replayStartEventId?: number +): earliestEventId is number { + if (earliestEventId === undefined || !Number.isFinite(earliestEventId)) return false + if ( + afterEventId === 0 && + replayStartEventId !== undefined && + Number.isFinite(replayStartEventId) + ) { + return earliestEventId > replayStartEventId + } + return afterEventId + 1 < earliestEventId +} + +function readMemoryMeta(executionId: string): ExecutionMetaReadResult { + pruneExpiredMemoryStreams() + const stream = memoryExecutionStreams.get(executionId) + if (!stream?.meta) return { status: 'missing' } + return { status: 'found', meta: stream.meta } +} + +function readMemoryEvents(executionId: string, afterEventId: number): ExecutionEventsReadResult { + pruneExpiredMemoryStreams() + const stream = memoryExecutionStreams.get(executionId) + if (!stream) return { status: 'ok', events: [] } + const earliestEventId = stream.meta?.earliestEventId + if ( + isReplayBeforeAvailableEvents(afterEventId, earliestEventId, stream.meta?.replayStartEventId) + ) { + return { status: 'pruned', earliestEventId } + } + return { + status: 'ok', + events: stream.events.filter((entry) => entry.eventId > afterEventId), + } +} + +function createMemoryExecutionEventWriter(executionId: string): ExecutionEventWriter { + const writeMemoryEvent = async (event: ExecutionEvent) => { + const stream = getMemoryStream(executionId) + const entry = { + eventId: stream.nextEventId++, + executionId, + event, + } + stream.events.push(entry) + if (stream.events.length > EVENT_LIMIT) { + stream.events = stream.events.slice(-EVENT_LIMIT) + 
const earliestEventId = stream.events[0]?.eventId + if (earliestEventId !== undefined && stream.meta) { + stream.meta = { + ...stream.meta, + earliestEventId, + updatedAt: new Date().toISOString(), + } + } + } + touchMemoryStream(stream) + return entry + } + + return { + write: writeMemoryEvent, + writeTerminal: async (event, status) => { + const entry = await writeMemoryEvent(event) + const stream = getMemoryStream(executionId) + stream.meta = { + ...stream.meta, + status, + updatedAt: new Date().toISOString(), + } + touchMemoryStream(stream) + return entry + }, + flush: async () => {}, + close: async () => {}, + } +} + +export async function flushExecutionStreamReplayBuffer( + executionId: string, + writer: ExecutionEventWriter +): Promise { + let writerClosed = false + for (let attempt = 1; attempt <= FINALIZE_FLUSH_ATTEMPTS; attempt++) { + try { + if (!writerClosed) { + await writer.close() + writerClosed = true + } + return true + } catch (error) { + logger.warn('Failed to flush execution stream replay buffer during finalization', { + executionId, + attempt, + error: toError(error).message, + }) + } + } + return false +} + +export async function resetExecutionStreamBuffer(executionId: string): Promise { + if (canUseMemoryEventBuffer()) { + const stream = getMemoryStream(executionId) + stream.events = [] + stream.meta = { + status: 'active', + replayStartEventId: stream.nextEventId, + updatedAt: new Date().toISOString(), + } + stream.expiresAt = Date.now() + TTL_SECONDS * 1000 + return true + } + + const redis = getRedisClient() + if (!redis) { + logger.warn('resetExecutionStreamBuffer: Redis client unavailable', { executionId }) + return false + } + + try { + const currentSequence = Number(await redis.get(getSeqKey(executionId)).catch(() => 0)) + const replayStartEventId = Number.isFinite(currentSequence) ? 
currentSequence + 1 : 1 + const metaKey = getMetaKey(executionId) + await redis.del(getEventsKey(executionId), metaKey) + await redis.hset(metaKey, { + replayStartEventId: String(replayStartEventId), + updatedAt: new Date().toISOString(), + }) + await redis.expire(metaKey, TTL_SECONDS) + return true + } catch (error) { + logger.warn('Failed to reset execution stream buffer', { + executionId, + error: toError(error).message, + }) + return false + } +} + export async function setExecutionMeta( executionId: string, meta: Partial -): Promise { +): Promise { const redis = getRedisClient() if (!redis) { + if (canUseMemoryEventBuffer()) { + const stream = getMemoryStream(executionId) + const status = meta.status ?? stream.meta?.status + if (!status) return false + stream.meta = { + ...stream.meta, + ...meta, + status, + updatedAt: new Date().toISOString(), + } + touchMemoryStream(stream) + return true + } logger.warn('setExecutionMeta: Redis client unavailable', { executionId }) - return + return false } try { const key = getMetaKey(executionId) @@ -67,71 +303,160 @@ export async function setExecutionMeta( if (meta.status) payload.status = meta.status if (meta.userId) payload.userId = meta.userId if (meta.workflowId) payload.workflowId = meta.workflowId + if (meta.earliestEventId !== undefined) payload.earliestEventId = String(meta.earliestEventId) + if (meta.replayStartEventId !== undefined) { + payload.replayStartEventId = String(meta.replayStartEventId) + } await redis.hset(key, payload) await redis.expire(key, TTL_SECONDS) + return true } catch (error) { logger.warn('Failed to update execution meta', { executionId, error: toError(error).message, }) + return false } } -export async function getExecutionMeta(executionId: string): Promise { +export async function initializeExecutionStreamMeta( + executionId: string, + meta: Omit & { status?: 'active' } +): Promise { + for (let attempt = 1; attempt <= ACTIVE_META_ATTEMPTS; attempt++) { + const metaPersisted = await 
setExecutionMeta(executionId, { + ...meta, + status: 'active', + }) + if (metaPersisted) return true + logger.warn('Failed to persist active execution meta during initialization', { + executionId, + attempt, + }) + } + return false +} + +export async function readExecutionMetaState( + executionId: string +): Promise { const redis = getRedisClient() if (!redis) { + if (canUseMemoryEventBuffer()) { + return readMemoryMeta(executionId) + } logger.warn('getExecutionMeta: Redis client unavailable', { executionId }) - return null + return { status: 'unavailable', error: 'Redis client unavailable' } } try { const key = getMetaKey(executionId) const meta = await redis.hgetall(key) - if (!meta || Object.keys(meta).length === 0) return null - if (!isExecutionStreamStatus(meta.status)) return null + if (!meta || Object.keys(meta).length === 0) return { status: 'missing' } + if (!isExecutionStreamStatus(meta.status)) return { status: 'missing' } return { - status: meta.status, - userId: meta.userId, - workflowId: meta.workflowId, - updatedAt: meta.updatedAt, + status: 'found', + meta: { + status: meta.status, + userId: meta.userId, + workflowId: meta.workflowId, + updatedAt: meta.updatedAt, + earliestEventId: + meta.earliestEventId !== undefined ? Number(meta.earliestEventId) : undefined, + replayStartEventId: + meta.replayStartEventId !== undefined ? 
Number(meta.replayStartEventId) : undefined, + }, } } catch (error) { + const message = toError(error).message logger.warn('Failed to read execution meta', { executionId, - error: toError(error).message, + error: message, }) + return { status: 'unavailable', error: message } + } +} + +export async function getExecutionMeta(executionId: string): Promise { + const result = await readExecutionMetaState(executionId) + if (result.status === 'found') return result.meta + if (result.status === 'unavailable') { return null } + return null } export async function readExecutionEvents( executionId: string, afterEventId: number ): Promise { + const result = await readExecutionEventsState(executionId, afterEventId) + return result.status === 'ok' ? result.events : [] +} + +export async function readExecutionEventsState( + executionId: string, + afterEventId: number +): Promise { const redis = getRedisClient() - if (!redis) return [] + if (!redis) { + if (canUseMemoryEventBuffer()) { + return readMemoryEvents(executionId, afterEventId) + } + return { status: 'unavailable', error: 'Redis client unavailable' } + } try { + const meta = await redis.hgetall(getMetaKey(executionId)) + const earliestEventId = + meta?.earliestEventId !== undefined ? Number(meta.earliestEventId) : undefined + const replayStartEventId = + meta?.replayStartEventId !== undefined ? Number(meta.replayStartEventId) : undefined + if (isReplayBeforeAvailableEvents(afterEventId, earliestEventId, replayStartEventId)) { + return { status: 'pruned', earliestEventId } + } + const raw = await redis.zrangebyscore(getEventsKey(executionId), afterEventId + 1, '+inf') - return raw - .map((entry) => { - try { - return JSON.parse(entry) as ExecutionEventEntry - } catch { - return null - } - }) - .filter((entry): entry is ExecutionEventEntry => Boolean(entry)) + const latestMeta = await redis.hgetall(getMetaKey(executionId)) + const latestEarliestEventId = + latestMeta?.earliestEventId !== undefined ? 
Number(latestMeta.earliestEventId) : undefined + const latestReplayStartEventId = + latestMeta?.replayStartEventId !== undefined + ? Number(latestMeta.replayStartEventId) + : undefined + if ( + isReplayBeforeAvailableEvents(afterEventId, latestEarliestEventId, latestReplayStartEventId) + ) { + return { status: 'pruned', earliestEventId: latestEarliestEventId } + } + + return { + status: 'ok', + events: raw + .map((entry) => { + try { + return JSON.parse(entry) as ExecutionEventEntry + } catch { + return null + } + }) + .filter((entry): entry is ExecutionEventEntry => Boolean(entry)), + } } catch (error) { + const message = toError(error).message logger.warn('Failed to read execution events', { executionId, - error: toError(error).message, + error: message, }) - return [] + return { status: 'unavailable', error: message } } } export function createExecutionEventWriter(executionId: string): ExecutionEventWriter { const redis = getRedisClient() if (!redis) { + if (canUseMemoryEventBuffer()) { + logger.info('createExecutionEventWriter: using in-memory event buffer', { executionId }) + return createMemoryExecutionEventWriter(executionId) + } logger.warn( 'createExecutionEventWriter: Redis client unavailable, events will not be buffered', { @@ -140,6 +465,9 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW ) return { write: async (event) => ({ eventId: 0, executionId, event }), + writeTerminal: async () => { + throw new Error(`Execution event buffer unavailable for ${executionId}`) + }, flush: async () => {}, close: async () => {}, } @@ -154,7 +482,7 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW if (flushTimer) return flushTimer = setTimeout(() => { flushTimer = null - void flush() + void flushPending() }, FLUSH_INTERVAL_MS) } @@ -168,12 +496,14 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW } } - let flushPromise: Promise | null = null + let flushPromise: Promise 
| null = null let closed = false + let writeQueue: Promise = Promise.resolve() const inflightWrites = new Set>() + let writeFailure: Error | null = null - const doFlush = async () => { - if (pending.length === 0) return + const doFlush = async (terminalStatus?: TerminalExecutionStreamStatus): Promise => { + if (pending.length === 0) return true const batch = pending pending = [] try { @@ -182,12 +512,19 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW for (const entry of batch) { zaddArgs.push(entry.eventId, JSON.stringify(entry)) } - const pipeline = redis.pipeline() - pipeline.zadd(key, ...zaddArgs) - pipeline.expire(key, TTL_SECONDS) - pipeline.expire(getSeqKey(executionId), TTL_SECONDS) - pipeline.zremrangebyrank(key, 0, -EVENT_LIMIT - 1) - await pipeline.exec() + await redis.eval( + FLUSH_EVENTS_SCRIPT, + 3, + key, + getSeqKey(executionId), + getMetaKey(executionId), + TTL_SECONDS, + EVENT_LIMIT, + new Date().toISOString(), + terminalStatus ?? '', + ...zaddArgs + ) + return true } catch (error) { logger.warn('Failed to flush execution events', { executionId, @@ -205,25 +542,37 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW remaining: pending.length, }) } + return false } } - const flush = async () => { - if (flushPromise) { - await flushPromise - return - } - flushPromise = doFlush() - try { - await flushPromise - } finally { - flushPromise = null - if (pending.length > 0) scheduleFlush() + const flushPending = async ( + scheduleOnFailure = true, + terminalStatus?: TerminalExecutionStreamStatus + ): Promise => { + while (true) { + if (flushPromise) { + const ok = await flushPromise + if (!ok) return false + continue + } + if (pending.length === 0) return true + + flushPromise = doFlush(terminalStatus) + let ok = false + try { + ok = await flushPromise + } finally { + flushPromise = null + } + if (!ok) { + if (scheduleOnFailure && pending.length > 0) scheduleFlush() + return false + } } } 
const writeCore = async (event: ExecutionEvent): Promise => { - if (closed) return { eventId: 0, executionId, event } if (nextEventId === 0 || nextEventId > maxReservedId) { await reserveIds(1) } @@ -231,7 +580,7 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW const entry: ExecutionEventEntry = { eventId, executionId, event } pending.push(entry) if (pending.length >= FLUSH_MAX_BATCH) { - await flush() + await flushPending() } else { scheduleFlush() } @@ -239,7 +588,54 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW } const write = (event: ExecutionEvent): Promise => { - const p = writeCore(event) + if (closed) return Promise.resolve({ eventId: 0, executionId, event }) + const p = writeQueue.then(() => writeCore(event)) + writeQueue = p.then( + () => { + writeFailure = null + }, + (error) => { + writeFailure = toError(error) + } + ) + inflightWrites.add(p) + const remove = () => inflightWrites.delete(p) + p.then(remove, remove) + return p + } + + const writeTerminal = ( + event: ExecutionEvent, + status: TerminalExecutionStreamStatus + ): Promise => { + if (closed) return Promise.resolve({ eventId: 0, executionId, event }) + const p = writeQueue.then(async () => { + if (flushTimer) { + clearTimeout(flushTimer) + flushTimer = null + } + if (nextEventId === 0 || nextEventId > maxReservedId) { + await reserveIds(1) + } + const eventId = nextEventId++ + const entry: ExecutionEventEntry = { eventId, executionId, event } + pending.push(entry) + const ok = await flushPending(false, status) + if (!ok) { + pending = pending.filter((pendingEntry) => pendingEntry !== entry) + throw new Error(`Failed to flush terminal execution event for ${executionId}`) + } + closed = true + return entry + }) + writeQueue = p.then( + () => { + writeFailure = null + }, + (error) => { + writeFailure = toError(error) + } + ) inflightWrites.add(p) const remove = () => inflightWrites.delete(p) p.then(remove, remove) @@ -258,10 
+654,23 @@ export function createExecutionEventWriter(executionId: string): ExecutionEventW if (flushPromise) { await flushPromise } - if (pending.length > 0) { - await doFlush() + await flushCore(false) + } + + const flushCore = async (scheduleOnFailure: boolean) => { + await writeQueue + const ok = await flushPending(scheduleOnFailure) + if (writeFailure) { + throw writeFailure + } + if (!ok) { + throw new Error(`Failed to flush execution events for ${executionId}`) } } - return { write, flush, close } + const flush = async () => { + await flushCore(true) + } + + return { write, writeTerminal, flush, close } } diff --git a/apps/sim/lib/logs/execution/logging-session.ts b/apps/sim/lib/logs/execution/logging-session.ts index e5179a163ef..a2df1c58b7c 100644 --- a/apps/sim/lib/logs/execution/logging-session.ts +++ b/apps/sim/lib/logs/execution/logging-session.ts @@ -652,6 +652,18 @@ export class LoggingSession { const endTime = endedAt ? new Date(endedAt) : new Date() const durationMs = typeof totalDurationMs === 'number' ? totalDurationMs : 0 + const currentLog = await db + .select({ status: workflowExecutionLogs.status }) + .from(workflowExecutionLogs) + .where(eq(workflowExecutionLogs.executionId, this.executionId)) + .limit(1) + .then((rows) => rows[0]) + + if (currentLog?.status === 'cancelled') { + this.completed = true + return + } + const costSummary = traceSpans?.length ? calculateCostSummary(traceSpans) : { @@ -739,6 +751,18 @@ export class LoggingSession { const endTime = endedAt ? new Date(endedAt) : new Date() const durationMs = typeof totalDurationMs === 'number' ? totalDurationMs : 0 + const currentLog = await db + .select({ status: workflowExecutionLogs.status }) + .from(workflowExecutionLogs) + .where(eq(workflowExecutionLogs.executionId, this.executionId)) + .limit(1) + .then((rows) => rows[0]) + + if (currentLog?.status === 'cancelled') { + this.completed = true + return + } + const costSummary = traceSpans?.length ? 
calculateCostSummary(traceSpans) : { diff --git a/apps/sim/lib/logs/execution/trace-spans/span-factory.ts b/apps/sim/lib/logs/execution/trace-spans/span-factory.ts index 376eb483eda..3b7e874847c 100644 --- a/apps/sim/lib/logs/execution/trace-spans/span-factory.ts +++ b/apps/sim/lib/logs/execution/trace-spans/span-factory.ts @@ -65,6 +65,7 @@ function createBaseSpan(log: ValidBlockLog): TraceSpan { status: log.error ? 'error' : 'success', children: [], blockId: log.blockId, + executionOrder: log.executionOrder, input: log.input, output, ...(childIds ?? {}), diff --git a/apps/sim/lib/logs/types.ts b/apps/sim/lib/logs/types.ts index 60bc6fd1ec3..e0c893ed393 100644 --- a/apps/sim/lib/logs/types.ts +++ b/apps/sim/lib/logs/types.ts @@ -214,6 +214,7 @@ export interface TraceSpan { tokens?: TokenInfo relativeStartMs?: number blockId?: string + executionOrder?: number input?: Record output?: Record childWorkflowSnapshotId?: string diff --git a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts index b6c4ea92398..a5508b5476b 100644 --- a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts +++ b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts @@ -477,9 +477,7 @@ export async function trackChatUpload( size, }) - logger.info( - `Tracked chat upload: ${fileName} (display: ${candidate}) for chat ${chatId}` - ) + logger.info(`Tracked chat upload: ${fileName} (display: ${candidate}) for chat ${chatId}`) return { displayName: candidate } } catch (error) { // Other 23505s (e.g. 
active-key collision from a racing same-s3Key insert) signal diff --git a/apps/sim/lib/workflows/executor/execution-core.test.ts b/apps/sim/lib/workflows/executor/execution-core.test.ts index da8092d3366..e9370ef9454 100644 --- a/apps/sim/lib/workflows/executor/execution-core.test.ts +++ b/apps/sim/lib/workflows/executor/execution-core.test.ts @@ -362,6 +362,68 @@ describe('executeWorkflowCore terminal finalization sequencing', () => { ]) }) + it('awaits fire-and-forget block callbacks before returning terminal result', async () => { + let releaseBlockComplete: (() => void) | undefined + let markCallbackStarted: (() => void) | undefined + const blockCompletePromise = new Promise((resolve) => { + releaseBlockComplete = resolve + }) + const callbackStartedPromise = new Promise((resolve) => { + markCallbackStarted = resolve + }) + const callOrder: string[] = [] + let hasReturned = false + + executorExecuteMock.mockImplementation(async () => { + const contextExtensions = executorConstructorMock.mock.calls[0]?.[0]?.contextExtensions + void contextExtensions.onBlockComplete('block-1', 'Fetch', 'api', { + input: {}, + output: { done: true }, + executionTime: 10, + startedAt: new Date().toISOString(), + executionOrder: 1, + endedAt: new Date().toISOString(), + }) + callOrder.push('executor:return') + + return { + success: true, + status: 'completed', + output: { done: true }, + logs: [], + metadata: { duration: 123, startTime: 'start', endTime: 'end' }, + } + }) + + const executionPromise = executeWorkflowCore({ + snapshot: createSnapshot() as any, + callbacks: { + onBlockComplete: async () => { + callOrder.push('callback:start') + markCallbackStarted?.() + await blockCompletePromise + callOrder.push('callback:end') + }, + }, + loggingSession: loggingSession as any, + }).then((result) => { + hasReturned = true + callOrder.push('core:return') + return result + }) + + await callbackStartedPromise + + expect(callOrder).toEqual(['executor:return', 'callback:start']) + 
expect(hasReturned).toBe(false) + + releaseBlockComplete?.() + const result = await executionPromise + + expect(result.status).toBe('completed') + expect(callOrder).toEqual(['executor:return', 'callback:start', 'callback:end', 'core:return']) + }) + it('preserves successful execution when success finalization throws', async () => { executorExecuteMock.mockResolvedValue({ success: true, diff --git a/apps/sim/lib/workflows/executor/execution-core.ts b/apps/sim/lib/workflows/executor/execution-core.ts index 65a45e781b6..aec5f956c50 100644 --- a/apps/sim/lib/workflows/executor/execution-core.ts +++ b/apps/sim/lib/workflows/executor/execution-core.ts @@ -278,6 +278,22 @@ export async function executeWorkflowCore( let processedInput = input || {} let deploymentVersionId: string | undefined let loggingStarted = false + const pendingLifecycleCallbacks = new Set>() + + const trackLifecycleCallback = (promise: Promise) => { + pendingLifecycleCallbacks.add(promise) + void promise + .finally(() => { + pendingLifecycleCallbacks.delete(promise) + }) + .catch(() => {}) + } + + const waitForLifecycleCallbacks = async () => { + while (pendingLifecycleCallbacks.size > 0) { + await Promise.allSettled([...pendingLifecycleCallbacks]) + } + } try { let blocks @@ -358,14 +374,22 @@ export async function executeWorkflowCore( // Check if this is a resume execution before trigger resolution const resumeFromSnapshot = metadata.resumeFromSnapshot === true const resumePendingQueue = snapshot.state?.pendingQueue + const resumeRemainingEdges = snapshot.state?.remainingEdges + const resumeTerminalNoop = metadata.resumeTerminalNoop === true let resolvedTriggerBlockId = triggerBlockId - // For resume executions, skip trigger resolution since we have a pending queue - if (resumeFromSnapshot && resumePendingQueue?.length) { + // Resume executions derive their queue from the snapshot. Even an empty + // queue is meaningful: a terminal pause block has no downstream work. 
+ if ( + resumeFromSnapshot && + (resumePendingQueue !== undefined || resumeRemainingEdges !== undefined || resumeTerminalNoop) + ) { resolvedTriggerBlockId = undefined logger.info(`[${requestId}] Skipping trigger resolution for resume execution`, { - pendingQueueLength: resumePendingQueue.length, + pendingQueueLength: resumePendingQueue?.length ?? 0, + remainingEdgeCount: resumeRemainingEdges?.length ?? 0, + resumeTerminalNoop, }) } else if (!triggerBlockId) { const executionKind = @@ -425,7 +449,7 @@ export async function executeWorkflowCore( }) } - const wrappedOnBlockComplete = async ( + const wrappedOnBlockComplete = ( blockId: string, blockName: string, blockType: string, @@ -439,36 +463,47 @@ export async function executeWorkflowCore( iterationContext?: IterationContext, childWorkflowContext?: ChildWorkflowContext ) => { - try { + let persistenceSucceeded = false + const persistencePromise = (async () => { await loggingSession.onBlockComplete(blockId, blockName, blockType, output) - if (onBlockComplete) { - void onBlockComplete( + persistenceSucceeded = true + })().catch((error) => { + logger.warn(`[${requestId}] Block completion persistence failed`, { + executionId, + blockId, + blockType, + error, + }) + }) + + const lifecyclePromise = (async () => { + await persistencePromise + if (!persistenceSucceeded || !onBlockComplete) return + + try { + await onBlockComplete( blockId, blockName, blockType, output, iterationContext, childWorkflowContext - ).catch((error) => { - logger.warn(`[${requestId}] Block completion callback failed`, { - executionId, - blockId, - blockType, - error, - }) + ) + } catch (error) { + logger.warn(`[${requestId}] Block completion callback failed`, { + executionId, + blockId, + blockType, + error, }) } - } catch (error) { - logger.warn(`[${requestId}] Block completion persistence failed`, { - executionId, - blockId, - blockType, - error, - }) - } + })() + + trackLifecycleCallback(lifecyclePromise) + return persistencePromise } - const 
wrappedOnBlockStart = async ( + const wrappedOnBlockStart = ( blockId: string, blockName: string, blockType: string, @@ -476,33 +511,44 @@ export async function executeWorkflowCore( iterationContext?: IterationContext, childWorkflowContext?: ChildWorkflowContext ) => { - try { + let persistenceSucceeded = false + const persistencePromise = (async () => { await loggingSession.onBlockStart(blockId, blockName, blockType, new Date().toISOString()) - if (onBlockStart) { - void onBlockStart( + persistenceSucceeded = true + })().catch((error) => { + logger.warn(`[${requestId}] Block start persistence failed`, { + executionId, + blockId, + blockType, + error, + }) + }) + + const lifecyclePromise = (async () => { + await persistencePromise + if (!persistenceSucceeded || !onBlockStart) return + + try { + await onBlockStart( blockId, blockName, blockType, executionOrder, iterationContext, childWorkflowContext - ).catch((error) => { - logger.warn(`[${requestId}] Block start callback failed`, { - executionId, - blockId, - blockType, - error, - }) + ) + } catch (error) { + logger.warn(`[${requestId}] Block start callback failed`, { + executionId, + blockId, + blockType, + error, }) } - } catch (error) { - logger.warn(`[${requestId}] Block start persistence failed`, { - executionId, - blockId, - blockType, - error, - }) - } + })() + + trackLifecycleCallback(lifecyclePromise) + return persistencePromise } const contextExtensions: ContextExtensions = { @@ -561,6 +607,8 @@ export async function executeWorkflowCore( )) as ExecutionResult) : ((await executorInstance.execute(workflowId, resolvedTriggerBlockId)) as ExecutionResult) + await waitForLifecycleCallbacks() + loggingSession.setPostExecutionPromise( (async () => { try { @@ -595,6 +643,8 @@ export async function executeWorkflowCore( } catch (error: unknown) { logger.error(`[${requestId}] Execution failed:`, error) + await waitForLifecycleCallbacks() + if (!loggingStarted) { loggingStarted = await loggingSession.safeStart({ 
userId, diff --git a/apps/sim/lib/workflows/executor/execution-events.ts b/apps/sim/lib/workflows/executor/execution-events.ts index b8089ea2146..337045143ef 100644 --- a/apps/sim/lib/workflows/executor/execution-events.ts +++ b/apps/sim/lib/workflows/executor/execution-events.ts @@ -68,6 +68,8 @@ export interface ExecutionPausedEvent extends BaseExecutionEvent { duration: number startTime: string endTime: string + /** Authoritative per-block terminal states from the server's blockLogs. */ + finalBlockLogs?: BlockLog[] } } @@ -184,7 +186,12 @@ export interface BlockChildWorkflowStartedEvent extends BaseExecutionEvent { blockId: string childWorkflowInstanceId: string iterationCurrent?: number + iterationTotal?: number + iterationType?: SubflowType iterationContainerId?: string + parentIterations?: ParentIteration[] + childWorkflowBlockId?: string + childWorkflowName?: string executionOrder?: number } } @@ -431,13 +438,14 @@ export function createExecutionCallbacks(options: { } } - const onChildWorkflowInstanceReady = ( + const onChildWorkflowInstanceReady = async ( blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, - executionOrder?: number + executionOrder?: number, + childWorkflowContext?: ChildWorkflowContext ) => { - void sendBufferedEvent({ + await sendBufferedEvent({ type: 'block:childWorkflowStarted', timestamp: new Date().toISOString(), executionId, @@ -447,7 +455,16 @@ export function createExecutionCallbacks(options: { childWorkflowInstanceId, ...(iterationContext && { iterationCurrent: iterationContext.iterationCurrent, + iterationTotal: iterationContext.iterationTotal, + iterationType: iterationContext.iterationType, iterationContainerId: iterationContext.iterationContainerId, + ...(iterationContext.parentIterations?.length && { + parentIterations: iterationContext.parentIterations, + }), + }), + ...(childWorkflowContext && { + childWorkflowBlockId: childWorkflowContext.parentBlockId, + childWorkflowName: 
childWorkflowContext.workflowName, }), ...(executionOrder !== undefined && { executionOrder }), }, diff --git a/apps/sim/lib/workflows/executor/human-in-the-loop-manager.ts b/apps/sim/lib/workflows/executor/human-in-the-loop-manager.ts index ff4a76be0c7..330fe93e14c 100644 --- a/apps/sim/lib/workflows/executor/human-in-the-loop-manager.ts +++ b/apps/sim/lib/workflows/executor/human-in-the-loop-manager.ts @@ -6,7 +6,13 @@ import { generateId } from '@sim/utils/id' import { and, asc, desc, eq, inArray, lt, type SQL, sql } from 'drizzle-orm' import type { Edge } from 'reactflow' import { createTimeoutAbortController, getTimeoutErrorMessage } from '@/lib/core/execution-limits' -import { createExecutionEventWriter, setExecutionMeta } from '@/lib/execution/event-buffer' +import { + createExecutionEventWriter, + flushExecutionStreamReplayBuffer, + initializeExecutionStreamMeta, + resetExecutionStreamBuffer, + type TerminalExecutionStreamStatus, +} from '@/lib/execution/event-buffer' import { preprocessExecution } from '@/lib/execution/preprocessing' import { LoggingSession } from '@/lib/logs/execution/logging-session' import { executeWorkflowCore } from '@/lib/workflows/executor/execution-core' @@ -16,6 +22,7 @@ import type { ChildWorkflowContext, ExecutionCallbacks, IterationContext, + SerializableExecutionState, } from '@/executor/execution/types' import type { ExecutionResult, @@ -29,11 +36,19 @@ import { filterOutputForLog } from '@/executor/utils/output-filter' import type { SerializedConnection } from '@/serializer/types' const logger = createLogger('HumanInTheLoopManager') +const RUN_BUFFER_UNAVAILABLE_ERROR = 'Run buffer temporarily unavailable' +const TERMINAL_PUBLISH_ERROR = 'Run buffer terminal event publish failed' +const RESUMABLE_PAUSED_STATUSES = ['paused', 'partially_resumed'] as const +const CANCELLABLE_PAUSED_STATUSES = ['paused', 'partially_resumed'] as const function isRecord(value: unknown): value is Record { return value !== null && typeof value === 
'object' && !Array.isArray(value) } +function isResumablePausedStatus(status: string): boolean { + return RESUMABLE_PAUSED_STATUSES.includes(status as (typeof RESUMABLE_PAUSED_STATUSES)[number]) +} + interface ResumeQueueEntrySummary { id: string pausedExecutionId: string @@ -171,44 +186,81 @@ export class PauseResumeManager { const nextResumeAt = computeEarliestResumeAt(pausePoints) const now = new Date() + const metadata = { + pauseScope: 'execution', + triggerIds: snapshotSeed.triggerIds, + executorUserId: executorUserId ?? null, + } - await db - .insert(pausedExecutions) - .values({ - id: generateId(), - workflowId, - executionId, - executionSnapshot: snapshotSeed, - pausePoints: pausePointsRecord, - totalPauseCount: pausePoints.length, - resumedCount: 0, - status: 'paused', - metadata: { - pauseScope: 'execution', - triggerIds: snapshotSeed.triggerIds, - executorUserId: executorUserId ?? null, - }, - pausedAt: now, - updatedAt: now, - nextResumeAt, - }) - .onConflictDoUpdate({ - target: pausedExecutions.executionId, - set: { + await db.transaction(async (tx) => { + const existing = await tx + .select() + .from(pausedExecutions) + .where(eq(pausedExecutions.executionId, executionId)) + .for('update') + .limit(1) + .then((rows) => rows[0]) + + if (!existing) { + await tx.insert(pausedExecutions).values({ + id: generateId(), + workflowId, + executionId, executionSnapshot: snapshotSeed, pausePoints: pausePointsRecord, totalPauseCount: pausePoints.length, resumedCount: 0, status: 'paused', - metadata: { - pauseScope: 'execution', - triggerIds: snapshotSeed.triggerIds, - executorUserId: executorUserId ?? null, - }, + metadata, + pausedAt: now, updatedAt: now, nextResumeAt, - }, - }) + }) + return + } + + const existingPausePoints = (existing.pausePoints as Record) ?? {} + const mergedPausePoints = Object.fromEntries( + Object.entries(existingPausePoints).map(([contextId, point]) => [ + contextId, + point?.resumeStatus === 'resuming' + ? 
{ ...point, resumeStatus: 'resumed', resumedAt: now.toISOString() } + : point, + ]) + ) + + for (const [contextId, point] of Object.entries(pausePointsRecord)) { + mergedPausePoints[contextId] = point + } + + const mergedPoints = Object.values(mergedPausePoints) + const resumedCount = mergedPoints.filter((point) => point?.resumeStatus === 'resumed').length + const totalPauseCount = mergedPoints.length + const mergedNextResumeAt = computeEarliestResumeAt(mergedPoints as PausePoint[]) + const nextStatus = + existing.status === 'cancelling' + ? 'cancelling' + : totalPauseCount > 0 && resumedCount >= totalPauseCount + ? 'fully_resumed' + : resumedCount > 0 + ? 'partially_resumed' + : 'paused' + + await tx + .update(pausedExecutions) + .set({ + workflowId, + executionSnapshot: snapshotSeed, + pausePoints: mergedPausePoints, + totalPauseCount, + resumedCount, + status: nextStatus, + metadata, + updatedAt: now, + nextResumeAt: mergedNextResumeAt, + }) + .where(eq(pausedExecutions.id, existing.id)) + }) await PauseResumeManager.processQueuedResumes(executionId) } @@ -229,8 +281,8 @@ export class PauseResumeManager { throw new Error('Paused execution not found or already resumed') } - if (pausedExecution.status === 'cancelled') { - throw new Error('Execution has been cancelled') + if (!isResumablePausedStatus(pausedExecution.status)) { + throw new Error('Paused execution is not resumable') } const pausePoints = pausedExecution.pausePoints as Record @@ -407,31 +459,64 @@ export class PauseResumeManager { } } } else { - await PauseResumeManager.updateSnapshotAfterResume({ - pausedExecutionId: pausedExecution.id, - contextId, - pauseBlockId: pauseBlockId, - }) + if (result.status === 'cancelled') { + await PauseResumeManager.markResumeAttemptFailed({ + resumeEntryId, + pausedExecutionId: pausedExecution.id, + parentExecutionId: pausedExecution.executionId, + contextId, + failureReason: 'Resume execution cancelled', + }) + const pausedCancellationStatus = await 
PauseResumeManager.getPausedCancellationStatus( + pausedExecution.executionId + ) + if (pausedCancellationStatus === 'cancelling') { + await PauseResumeManager.completePausedCancellation(pausedExecution.executionId) + } + } else { + await PauseResumeManager.updateSnapshotAfterResume({ + pausedExecutionId: pausedExecution.id, + contextId, + pauseBlockId: pauseBlockId, + executionState: result.executionState, + }) + await PauseResumeManager.markResumeCompleted({ + resumeEntryId, + pausedExecutionId: pausedExecution.id, + parentExecutionId: pausedExecution.executionId, + contextId, + }) + } } - await PauseResumeManager.markResumeCompleted({ - resumeEntryId, - pausedExecutionId: pausedExecution.id, - parentExecutionId: pausedExecution.executionId, - contextId, - }) + if (result.status === 'paused') { + await PauseResumeManager.markResumeCompleted({ + resumeEntryId, + }) + } await PauseResumeManager.processQueuedResumes(pausedExecution.executionId) return result } catch (error) { - await PauseResumeManager.markResumeFailed({ - resumeEntryId, - pausedExecutionId: pausedExecution.id, - parentExecutionId: pausedExecution.executionId, - contextId, - failureReason: (error as Error).message, - }) + const message = toError(error).message + if (message === RUN_BUFFER_UNAVAILABLE_ERROR || message === TERMINAL_PUBLISH_ERROR) { + await PauseResumeManager.markResumeAttemptFailed({ + resumeEntryId, + pausedExecutionId: pausedExecution.id, + parentExecutionId: pausedExecution.executionId, + contextId, + failureReason: message, + }) + } else { + await PauseResumeManager.markResumeFailed({ + resumeEntryId, + pausedExecutionId: pausedExecution.id, + parentExecutionId: pausedExecution.executionId, + contextId, + failureReason: message, + }) + } logger.error('Resume execution failed', { parentExecutionId: pausedExecution.executionId, resumeExecutionId, @@ -542,6 +627,8 @@ export class PauseResumeManager { executedBlocksCount: stateCopy?.executedBlocks?.length ?? 
0, }) + let terminalResumeOutput: Record | undefined + if (stateCopy) { const dagIncomingEdges: Record | undefined = stateCopy.dagIncomingEdges || dagIncomingEdgesFromSnapshot @@ -654,6 +741,7 @@ export class PauseResumeManager { } pauseBlockState.output = mergedOutput + terminalResumeOutput = mergedOutput pauseBlockState.executed = true pauseBlockState.executionTime = pauseDurationMs if (stateBlockKey !== pauseBlockId && stateCopy.blockStates[pauseBlockId]) { @@ -770,6 +858,7 @@ export class PauseResumeManager { stateCopy.completedPauseContexts = Array.from(completedPauseContexts) stateCopy.remainingEdges = edgesToRemove stateCopy.pendingQueue = [] // Let the engine determine what's ready after removing edges + stateCopy.resumeTerminalNoop = edgesToRemove.length === 0 logger.info('Updated pause block state for resume', { pauseBlockId, @@ -791,6 +880,7 @@ export class PauseResumeManager { useDraftState: baseSnapshot.metadata.useDraftState, isClientSession: baseSnapshot.metadata.isClientSession, resumeFromSnapshot: true, + resumeTerminalNoop: stateCopy?.resumeTerminalNoop === true, } const resumeSnapshot = new ExecutionSnapshot( @@ -869,22 +959,44 @@ export class PauseResumeManager { }) const workflowId = pausedExecution.workflowId + const bufferReset = await resetExecutionStreamBuffer(resumeExecutionId) + if (!bufferReset) { + throw new Error(RUN_BUFFER_UNAVAILABLE_ERROR) + } + const eventWriter = createExecutionEventWriter(resumeExecutionId) - await setExecutionMeta(resumeExecutionId, { - status: 'active', + const metaInitialized = await initializeExecutionStreamMeta(resumeExecutionId, { userId: metadata.userId, workflowId, }) + if (!metaInitialized) { + throw new Error(RUN_BUFFER_UNAVAILABLE_ERROR) + } - let localEventSeq = 0 - const writeBufferedEvent = (event: ExecutionEvent) => { - localEventSeq++ - event.eventId = localEventSeq - eventWriter.write(event).catch(() => {}) + let terminalEventPublished = false + const writeBufferedEvent = async ( + event: 
ExecutionEvent, + terminalStatus?: TerminalExecutionStreamStatus + ) => { + const isBuffered = event.type !== 'stream:chunk' && event.type !== 'stream:done' + if (isBuffered) { + const entry = terminalStatus + ? await eventWriter.writeTerminal(event, terminalStatus).catch((error) => { + logger.warn('Failed to publish resume terminal event', { + resumeExecutionId, + status: terminalStatus, + error: toError(error).message, + }) + throw new Error(TERMINAL_PUBLISH_ERROR) + }) + : await eventWriter.write(event) + event.eventId = entry.eventId + terminalEventPublished ||= Boolean(terminalStatus) + } sendEvent?.(event) } - writeBufferedEvent({ + await writeBufferedEvent({ type: 'execution:started', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -901,7 +1013,7 @@ export class PauseResumeManager { iterationContext?: IterationContext, childWorkflowContext?: ChildWorkflowContext ) => { - writeBufferedEvent({ + await writeBufferedEvent({ type: 'block:started', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -964,7 +1076,7 @@ export class PauseResumeManager { : {}), } - writeBufferedEvent({ + await writeBufferedEvent({ type: hasError ? 
'block:error' : 'block:completed', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -976,13 +1088,13 @@ export class PauseResumeManager { await externalOnBlockComplete(blockId, callbackData.output) } }, - onChildWorkflowInstanceReady: ( + onChildWorkflowInstanceReady: async ( blockId: string, childWorkflowInstanceId: string, iterationContext?: IterationContext, executionOrder?: number ) => { - writeBufferedEvent({ + await writeBufferedEvent({ type: 'block:childWorkflowStarted', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -1015,7 +1127,7 @@ export class PauseResumeManager { const { done, value } = await reader.read() if (done) break const chunk = decoder.decode(value, { stream: true }) - writeBufferedEvent({ + await writeBufferedEvent({ type: 'stream:chunk', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -1023,7 +1135,7 @@ export class PauseResumeManager { data: { blockId, chunk }, } as ExecutionEvent) } - writeBufferedEvent({ + await writeBufferedEvent({ type: 'stream:done', timestamp: new Date().toISOString(), executionId: resumeExecutionId, @@ -1048,8 +1160,9 @@ export class PauseResumeManager { ? null : createTimeoutAbortController(preprocessingResult.executionTimeout?.async) - let result: ExecutionResult - let finalMetaStatus: 'complete' | 'error' | 'cancelled' = 'complete' + let result: ExecutionResult | undefined + let finalMetaStatus: TerminalExecutionStreamStatus = 'complete' + let executionError: unknown try { result = await executeWorkflowCore({ snapshot: resumeSnapshot, @@ -1061,6 +1174,13 @@ export class PauseResumeManager { abortSignal: externalAbortSignal ?? timeoutController?.signal, }) + if (resumeSnapshot.metadata.resumeTerminalNoop === true && result.status !== 'cancelled') { + result = { + ...result, + output: terminalResumeOutput ?? 
result.output, + } + } + if ( result.status === 'cancelled' && timeoutController?.isTimedOut() && @@ -1073,87 +1193,119 @@ export class PauseResumeManager { }) await loggingSession.markAsFailed(timeoutErrorMessage) - writeBufferedEvent({ - type: 'execution:error', - timestamp: new Date().toISOString(), - executionId: resumeExecutionId, - workflowId, - data: { - error: timeoutErrorMessage, - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, - }, - } as ExecutionEvent) finalMetaStatus = 'error' - } else if (result.status === 'cancelled') { - writeBufferedEvent({ - type: 'execution:cancelled', - timestamp: new Date().toISOString(), - executionId: resumeExecutionId, - workflowId, - data: { - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, + await writeBufferedEvent( + { + type: 'execution:error', + timestamp: new Date().toISOString(), + executionId: resumeExecutionId, + workflowId, + data: { + error: timeoutErrorMessage, + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, }, - } as ExecutionEvent) + 'error' + ) + } else if (result.status === 'cancelled') { finalMetaStatus = 'cancelled' - } else if (result.status === 'paused') { - writeBufferedEvent({ - type: 'execution:paused', - timestamp: new Date().toISOString(), - executionId: resumeExecutionId, - workflowId, - data: { - output: result.output, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || new Date().toISOString(), - endTime: result.metadata?.endTime || new Date().toISOString(), + await writeBufferedEvent( + { + type: 'execution:cancelled', + timestamp: new Date().toISOString(), + executionId: resumeExecutionId, + workflowId, + data: { + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, }, - } as ExecutionEvent) + 'cancelled' + ) + } else if (result.status === 'paused') { finalMetaStatus = 'complete' + await writeBufferedEvent( + { + type: 'execution:paused', + timestamp: 
new Date().toISOString(), + executionId: resumeExecutionId, + workflowId, + data: { + output: result.output, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || new Date().toISOString(), + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, + }, + 'complete' + ) } else { - writeBufferedEvent({ - type: 'execution:completed', + finalMetaStatus = 'complete' + await writeBufferedEvent( + { + type: 'execution:completed', + timestamp: new Date().toISOString(), + executionId: resumeExecutionId, + workflowId, + data: { + success: result.success, + output: result.output, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || new Date().toISOString(), + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, + }, + 'complete' + ) + } + } catch (execError) { + executionError = execError + const execErrorResult = hasExecutionResult(execError) ? execError.executionResult : undefined + finalMetaStatus = 'error' + await writeBufferedEvent( + { + type: 'execution:error', timestamp: new Date().toISOString(), executionId: resumeExecutionId, workflowId, data: { - success: result.success, - output: result.output, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || new Date().toISOString(), - endTime: result.metadata?.endTime || new Date().toISOString(), - finalBlockLogs: result.logs, + error: toError(execError).message, + duration: 0, + finalBlockLogs: execErrorResult?.logs, }, - } as ExecutionEvent) - finalMetaStatus = 'complete' - } - } catch (execError) { - const execErrorResult = hasExecutionResult(execError) ? 
execError.executionResult : undefined - writeBufferedEvent({ - type: 'execution:error', - timestamp: new Date().toISOString(), - executionId: resumeExecutionId, - workflowId, - data: { - error: toError(execError).message, - duration: 0, - finalBlockLogs: execErrorResult?.logs, }, - } as ExecutionEvent) - finalMetaStatus = 'error' - throw execError + 'error' + ) } finally { timeoutController?.cleanup() - try { - await eventWriter.close() - } catch (closeError) { - logger.warn('Failed to close event writer for resume', { + if (!terminalEventPublished) { + const replayBufferFlushed = await flushExecutionStreamReplayBuffer( resumeExecutionId, - error: toError(closeError).message, + eventWriter + ) + logger.warn('Failed to publish resume terminal event durably', { + resumeExecutionId, + status: finalMetaStatus, + replayBufferFlushed, + }) + if (!executionError) { + executionError = new Error(TERMINAL_PUBLISH_ERROR) + } + } else { + await eventWriter.close().catch((error) => { + logger.warn('Failed to close resume event writer after terminal publish', { + resumeExecutionId, + error: toError(error).message, + }) }) } - setExecutionMeta(resumeExecutionId, { status: finalMetaStatus }).catch(() => {}) + } + + if (executionError || !result) { + throw executionError ?? 
new Error('Resume execution did not produce a result') } return result @@ -1161,9 +1313,9 @@ export class PauseResumeManager { private static async markResumeCompleted(args: { resumeEntryId: string - pausedExecutionId: string - parentExecutionId: string - contextId: string + pausedExecutionId?: string + parentExecutionId?: string + contextId?: string }): Promise { const { resumeEntryId, pausedExecutionId, parentExecutionId, contextId } = args const now = new Date() @@ -1174,31 +1326,48 @@ export class PauseResumeManager { .set({ status: 'completed', completedAt: now, failureReason: null }) .where(eq(resumeQueue.id, resumeEntryId)) + if (!pausedExecutionId || !parentExecutionId || !contextId) { + return + } + await tx .update(pausedExecutions) .set({ pausePoints: sql`jsonb_set(jsonb_set(pause_points, ARRAY[${contextId}, 'resumeStatus'], '"resumed"'::jsonb), ARRAY[${contextId}, 'resumedAt'], '"${sql.raw(now.toISOString())}"'::jsonb)`, resumedCount: sql`resumed_count + 1`, - status: sql`CASE WHEN resumed_count + 1 >= total_pause_count THEN 'fully_resumed' ELSE 'partially_resumed' END`, + status: sql`CASE WHEN status = 'cancelling' THEN 'cancelling' WHEN resumed_count + 1 >= total_pause_count THEN 'fully_resumed' ELSE 'partially_resumed' END`, updatedAt: now, }) .where(eq(pausedExecutions.id, pausedExecutionId)) const [{ remaining }] = await tx - .select({ remaining: sql`total_pause_count - resumed_count - 1` }) + .select({ remaining: sql`total_pause_count - resumed_count` }) .from(pausedExecutions) .where(eq(pausedExecutions.executionId, parentExecutionId)) if (Number(remaining) <= 0) { await tx .update(pausedExecutions) - .set({ status: 'fully_resumed', updatedAt: now }) + .set({ + status: sql`CASE WHEN status = 'cancelling' THEN 'cancelling' ELSE 'fully_resumed' END`, + updatedAt: now, + }) .where(eq(pausedExecutions.executionId, parentExecutionId)) } else { await tx .update(workflowExecutionLogs) .set({ status: 'pending' }) - 
.where(eq(workflowExecutionLogs.executionId, parentExecutionId)) + .where( + and( + eq(workflowExecutionLogs.executionId, parentExecutionId), + sql`${workflowExecutionLogs.status} != 'cancelled'`, + sql`NOT EXISTS ( + SELECT 1 FROM ${pausedExecutions} + WHERE ${pausedExecutions.executionId} = ${parentExecutionId} + AND ${pausedExecutions.status} = 'cancelling' + )` + ) + ) } }) } @@ -1232,12 +1401,53 @@ export class PauseResumeManager { }) } + static async markResumeAttemptFailed(args: { + resumeEntryId: string + pausedExecutionId: string + parentExecutionId: string + contextId: string + failureReason: string + }): Promise { + const now = new Date() + + await db.transaction(async (tx) => { + await tx + .update(resumeQueue) + .set({ status: 'failed', failureReason: args.failureReason, completedAt: now }) + .where(eq(resumeQueue.id, args.resumeEntryId)) + + await tx + .update(pausedExecutions) + .set({ + pausePoints: sql`jsonb_set(pause_points, ARRAY[${args.contextId}, 'resumeStatus'], '"paused"'::jsonb)`, + status: sql`CASE WHEN status = 'cancelling' THEN 'cancelling' ELSE status END`, + updatedAt: now, + }) + .where(eq(pausedExecutions.id, args.pausedExecutionId)) + + await tx + .update(workflowExecutionLogs) + .set({ status: sql`CASE WHEN status = 'cancelled' THEN 'cancelled' ELSE 'paused' END` }) + .where( + and( + eq(workflowExecutionLogs.executionId, args.parentExecutionId), + sql`NOT EXISTS ( + SELECT 1 FROM ${pausedExecutions} + WHERE ${pausedExecutions.id} = ${args.pausedExecutionId} + AND ${pausedExecutions.status} = 'cancelling' + )` + ) + ) + }) + } + private static async updateSnapshotAfterResume(args: { pausedExecutionId: string contextId: string pauseBlockId: string + executionState?: SerializableExecutionState }): Promise { - const { pausedExecutionId, contextId, pauseBlockId } = args + const { pausedExecutionId, contextId, pauseBlockId, executionState } = args const pausedExecution = await db .select() @@ -1253,6 +1463,9 @@ export class 
PauseResumeManager { const currentSnapshot = pausedExecution.executionSnapshot as SerializedSnapshot const snapshotData = JSON.parse(currentSnapshot.snapshot) + if (executionState) { + snapshotData.state = executionState + } // Update the DAG incoming edges in the snapshot // Remove the edge from the resumed pause block @@ -1313,12 +1526,7 @@ export class PauseResumeManager { }) } - /** - * Cancels a paused execution by updating both the paused execution record and the - * workflow execution log status to 'cancelled'. Returns true if a paused execution - * was found and cancelled, false if no paused execution exists for this executionId. - */ - static async cancelPausedExecution(executionId: string): Promise { + static async beginPausedCancellation(executionId: string): Promise { const now = new Date() return await db.transaction(async (tx) => { @@ -1326,7 +1534,10 @@ export class PauseResumeManager { .select({ id: pausedExecutions.id }) .from(pausedExecutions) .where( - and(eq(pausedExecutions.executionId, executionId), eq(pausedExecutions.status, 'paused')) + and( + eq(pausedExecutions.executionId, executionId), + inArray(pausedExecutions.status, [...CANCELLABLE_PAUSED_STATUSES, 'cancelling']) + ) ) .for('update') .limit(1) @@ -1336,6 +1547,51 @@ export class PauseResumeManager { return false } + const activeResume = await tx + .select({ id: resumeQueue.id }) + .from(resumeQueue) + .where( + and(eq(resumeQueue.parentExecutionId, executionId), eq(resumeQueue.status, 'claimed')) + ) + .limit(1) + .then((rows) => rows[0]) + + if (activeResume) { + await tx + .update(pausedExecutions) + .set({ status: 'cancelling', updatedAt: now }) + .where(eq(pausedExecutions.id, pausedExecution.id)) + return false + } + + await tx + .update(pausedExecutions) + .set({ status: 'cancelling', updatedAt: now }) + .where(eq(pausedExecutions.id, pausedExecution.id)) + + return true + }) + } + + static async completePausedCancellation(executionId: string): Promise { + const now = new 
Date() + + return await db.transaction(async (tx) => { + const pausedExecution = await tx + .select({ id: pausedExecutions.id, status: pausedExecutions.status }) + .from(pausedExecutions) + .where(eq(pausedExecutions.executionId, executionId)) + .for('update') + .limit(1) + .then((rows) => rows[0]) + + if (!pausedExecution || pausedExecution.status !== 'cancelling') { + if (pausedExecution?.status === 'cancelled') { + return true + } + return false + } + await tx .update(pausedExecutions) .set({ status: 'cancelled', updatedAt: now }) @@ -1350,6 +1606,91 @@ export class PauseResumeManager { }) } + static async blockQueuedResumesForCancellation(executionId: string): Promise { + const now = new Date() + + return await db.transaction(async (tx) => { + const pausedExecution = await tx + .select({ id: pausedExecutions.id }) + .from(pausedExecutions) + .where( + and( + eq(pausedExecutions.executionId, executionId), + inArray(pausedExecutions.status, [...CANCELLABLE_PAUSED_STATUSES, 'cancelling']) + ) + ) + .for('update') + .limit(1) + .then((rows) => rows[0]) + + if (!pausedExecution) { + return false + } + + await tx + .update(pausedExecutions) + .set({ status: 'cancelling', updatedAt: now }) + .where(eq(pausedExecutions.id, pausedExecution.id)) + + await tx + .update(resumeQueue) + .set({ + status: 'failed', + completedAt: now, + failureReason: 'Paused execution cancellation requested', + }) + .where( + and(eq(resumeQueue.parentExecutionId, executionId), eq(resumeQueue.status, 'pending')) + ) + + return true + }) + } + + static async clearPausedCancellationIntent(executionId: string): Promise { + const now = new Date() + await db + .update(pausedExecutions) + .set({ + status: sql`CASE WHEN resumed_count > 0 THEN 'partially_resumed' ELSE 'paused' END`, + updatedAt: now, + }) + .where( + and( + eq(pausedExecutions.executionId, executionId), + eq(pausedExecutions.status, 'cancelling') + ) + ) + await PauseResumeManager.processQueuedResumes(executionId) + } + + static async 
getPausedCancellationStatus( + executionId: string + ): Promise<'cancelling' | 'cancelled' | null> { + const activeResume = await db + .select({ id: resumeQueue.id }) + .from(resumeQueue) + .where(and(eq(resumeQueue.parentExecutionId, executionId), eq(resumeQueue.status, 'claimed'))) + .limit(1) + .then((rows) => rows[0]) + + if (activeResume) { + return null + } + + const pausedExecution = await db + .select({ status: pausedExecutions.status }) + .from(pausedExecutions) + .where(eq(pausedExecutions.executionId, executionId)) + .limit(1) + .then((rows) => rows[0]) + + if (pausedExecution?.status === 'cancelling' || pausedExecution?.status === 'cancelled') { + return pausedExecution.status + } + return null + } + /** * Updates `next_resume_at` only when the row is still `status='paused'`. * Guard prevents the cron poller from clobbering a freshly-written value when a @@ -1498,52 +1839,100 @@ export class PauseResumeManager { } static async processQueuedResumes(parentExecutionId: string): Promise { - const pendingEntry = await db.transaction(async (tx) => { - const entry = await tx - .select() - .from(resumeQueue) - .where( - and( - eq(resumeQueue.parentExecutionId, parentExecutionId), - eq(resumeQueue.status, 'pending') + let pendingEntry: { + entry: typeof resumeQueue.$inferSelect + pausedExecution: typeof pausedExecutions.$inferSelect + } | null = null + + while (!pendingEntry) { + const selection = await db.transaction(async (tx) => { + const pausedExecution = await tx + .select() + .from(pausedExecutions) + .where(eq(pausedExecutions.executionId, parentExecutionId)) + .for('update') + .limit(1) + .then((rows) => rows[0]) + + if (!pausedExecution || !isResumablePausedStatus(pausedExecution.status)) { + return { action: 'empty' as const } + } + + const activeResume = await tx + .select({ id: resumeQueue.id }) + .from(resumeQueue) + .where( + and( + eq(resumeQueue.parentExecutionId, parentExecutionId), + eq(resumeQueue.status, 'claimed') + ) ) - ) - 
.orderBy(asc(resumeQueue.queuedAt)) - .limit(1) - .then((rows) => rows[0]) + .limit(1) + .then((rows) => rows[0]) - if (!entry) { - return null - } + if (activeResume) { + return { action: 'active' as const } + } - await tx - .update(resumeQueue) - .set({ status: 'claimed', claimedAt: new Date() }) - .where(eq(resumeQueue.id, entry.id)) + const entry = await tx + .select() + .from(resumeQueue) + .where( + and( + eq(resumeQueue.parentExecutionId, parentExecutionId), + eq(resumeQueue.status, 'pending') + ) + ) + .orderBy(asc(resumeQueue.queuedAt)) + .limit(1) + .for('update') + .then((rows) => rows[0]) - const pausedExecution = await tx - .select() - .from(pausedExecutions) - .where(eq(pausedExecutions.id, entry.pausedExecutionId)) - .limit(1) - .then((rows) => rows[0]) + if (!entry) { + return { action: 'empty' as const } + } - if (!pausedExecution) { - return null - } + const pausePoints = pausedExecution.pausePoints as Record + const pausePoint = pausePoints?.[entry.contextId] + if (!pausePoint || pausePoint.resumeStatus !== 'queued') { + await tx + .update(resumeQueue) + .set({ + status: 'failed', + completedAt: new Date(), + failureReason: 'Pause point is no longer queued', + }) + .where(eq(resumeQueue.id, entry.id)) + return { action: 'continue' as const } + } - await tx - .update(pausedExecutions) - .set({ - pausePoints: sql`jsonb_set(pause_points, ARRAY[${entry.contextId}, 'resumeStatus'], '"resuming"'::jsonb)`, - }) - .where(eq(pausedExecutions.id, pausedExecution.id)) + await tx + .update(resumeQueue) + .set({ status: 'claimed', claimedAt: new Date() }) + .where(eq(resumeQueue.id, entry.id)) - return { entry, pausedExecution } - }) + await tx + .update(pausedExecutions) + .set({ + pausePoints: sql`jsonb_set(pause_points, ARRAY[${entry.contextId}, 'resumeStatus'], '"resuming"'::jsonb)`, + }) + .where(eq(pausedExecutions.id, pausedExecution.id)) - if (!pendingEntry) { - return + return { action: 'claimed' as const, entry, pausedExecution } + }) + + if 
(selection.action === 'empty') { + return + } + if (selection.action === 'active') { + return + } + if (selection.action === 'claimed') { + pendingEntry = { + entry: selection.entry, + pausedExecution: selection.pausedExecution, + } + } } const { entry, pausedExecution } = pendingEntry diff --git a/apps/sim/lib/workflows/executor/queued-workflow-execution.ts b/apps/sim/lib/workflows/executor/queued-workflow-execution.ts index 2dc0fc85318..06a851a3b53 100644 --- a/apps/sim/lib/workflows/executor/queued-workflow-execution.ts +++ b/apps/sim/lib/workflows/executor/queued-workflow-execution.ts @@ -1,7 +1,12 @@ import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' import { createTimeoutAbortController, getTimeoutErrorMessage } from '@/lib/core/execution-limits' -import { createExecutionEventWriter, setExecutionMeta } from '@/lib/execution/event-buffer' +import { + createExecutionEventWriter, + type ExecutionEventWriter, + initializeExecutionStreamMeta, + type TerminalExecutionStreamStatus, +} from '@/lib/execution/event-buffer' import { LoggingSession } from '@/lib/logs/execution/logging-session' import { buildTraceSpans } from '@/lib/logs/execution/trace-spans/trace-spans' import { @@ -23,6 +28,7 @@ import type { BlockLog, NormalizedBlockOutput } from '@/executor/types' import { hasExecutionResult } from '@/executor/utils/errors' const logger = createLogger('QueuedWorkflowExecution') +const TERMINAL_PUBLISH_ERROR = 'Run buffer terminal event publish failed' export const DIRECT_WORKFLOW_JOB_NAME = 'direct-workflow-execution' @@ -86,23 +92,46 @@ function buildResult( } } +async function publishTerminalExecutionEvent(params: { + writer: ExecutionEventWriter + executionId: string + status: TerminalExecutionStreamStatus + event: ExecutionEvent +}): Promise { + try { + await params.writer.writeTerminal(params.event, params.status) + return true + } catch (error) { + logger.warn('Failed to buffer terminal execution event', { + executionId: 
params.executionId, + status: params.status, + error: toError(error).message, + }) + return false + } +} + export async function executeQueuedWorkflowJob( payload: QueuedWorkflowExecutionPayload ): Promise { const { metadata } = payload const { executionId, requestId, workflowId, triggerType } = metadata const loggingSession = new LoggingSession(workflowId, executionId, triggerType, requestId) - const timeoutController = createTimeoutAbortController(payload.timeoutMs) const eventWriter = payload.streamEvents ? createExecutionEventWriter(executionId) : null + let eventWriterClosed = false if (payload.streamEvents) { - await setExecutionMeta(executionId, { - status: 'active', + const metaInitialized = await initializeExecutionStreamMeta(executionId, { userId: metadata.userId, workflowId, }) + if (!metaInitialized) { + throw new Error('Run buffer temporarily unavailable') + } } + const timeoutController = createTimeoutAbortController(payload.timeoutMs) + try { const snapshot = new ExecutionSnapshot( metadata, @@ -161,19 +190,25 @@ export async function executeQueuedWorkflowJob( await loggingSession.markAsFailed(timeoutErrorMessage) if (eventWriter) { - await eventWriter.write({ - type: 'execution:error', - timestamp: new Date().toISOString(), + eventWriterClosed = await publishTerminalExecutionEvent({ + writer: eventWriter, executionId, - workflowId, - data: { - error: timeoutErrorMessage, - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, + status: 'error', + event: { + type: 'execution:error', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + error: timeoutErrorMessage, + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, }, }) - - await setExecutionMeta(executionId, { status: 'error' }) + } + if (eventWriter && !eventWriterClosed) { + throw new Error(TERMINAL_PUBLISH_ERROR) } return buildResult( @@ -208,49 +243,65 @@ export async function executeQueuedWorkflowJob( if (eventWriter) { if 
(result.status === 'cancelled') { - await eventWriter.write({ - type: 'execution:cancelled', - timestamp: new Date().toISOString(), + eventWriterClosed = await publishTerminalExecutionEvent({ + writer: eventWriter, executionId, - workflowId, - data: { - duration: result.metadata?.duration || 0, - finalBlockLogs: result.logs, + status: 'cancelled', + event: { + type: 'execution:cancelled', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + duration: result.metadata?.duration || 0, + finalBlockLogs: result.logs, + }, }, }) - await setExecutionMeta(executionId, { status: 'cancelled' }) } else if (result.status === 'paused') { - await eventWriter.write({ - type: 'execution:paused', - timestamp: new Date().toISOString(), + eventWriterClosed = await publishTerminalExecutionEvent({ + writer: eventWriter, executionId, - workflowId, - data: { - output: outputWithBase64, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || metadata.startTime, - endTime: result.metadata?.endTime || new Date().toISOString(), + status: 'complete', + event: { + type: 'execution:paused', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + output: outputWithBase64, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || metadata.startTime, + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, }, }) - await setExecutionMeta(executionId, { status: 'complete' }) } else { - await eventWriter.write({ - type: 'execution:completed', - timestamp: new Date().toISOString(), + eventWriterClosed = await publishTerminalExecutionEvent({ + writer: eventWriter, executionId, - workflowId, - data: { - success: result.success, - output: outputWithBase64, - duration: result.metadata?.duration || 0, - startTime: result.metadata?.startTime || metadata.startTime, - endTime: result.metadata?.endTime || new Date().toISOString(), - finalBlockLogs: 
result.logs, + status: 'complete', + event: { + type: 'execution:completed', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + success: result.success, + output: outputWithBase64, + duration: result.metadata?.duration || 0, + startTime: result.metadata?.startTime || metadata.startTime, + endTime: result.metadata?.endTime || new Date().toISOString(), + finalBlockLogs: result.logs, + }, }, }) - await setExecutionMeta(executionId, { status: 'complete' }) } } + if (eventWriter && !eventWriterClosed) { + throw new Error(TERMINAL_PUBLISH_ERROR) + } return buildResult( result.status === 'paused' @@ -274,6 +325,10 @@ export async function executeQueuedWorkflowJob( executionId ) } catch (error) { + if (toError(error).message === TERMINAL_PUBLISH_ERROR) { + throw error + } + logger.error('Queued workflow execution failed', { workflowId, executionId, @@ -295,18 +350,25 @@ export async function executeQueuedWorkflowJob( const executionResult = hasExecutionResult(error) ? 
error.executionResult : undefined if (eventWriter) { - await eventWriter.write({ - type: 'execution:error', - timestamp: new Date().toISOString(), + eventWriterClosed = await publishTerminalExecutionEvent({ + writer: eventWriter, executionId, - workflowId, - data: { - error: toError(error).message, - duration: 0, - finalBlockLogs: executionResult?.logs, + status: 'error', + event: { + type: 'execution:error', + timestamp: new Date().toISOString(), + executionId, + workflowId, + data: { + error: toError(error).message, + duration: 0, + finalBlockLogs: executionResult?.logs, + }, }, }) - await setExecutionMeta(executionId, { status: 'error' }) + } + if (eventWriter && !eventWriterClosed) { + throw new Error(TERMINAL_PUBLISH_ERROR) } return buildResult( @@ -330,8 +392,13 @@ export async function executeQueuedWorkflowJob( } finally { timeoutController.cleanup() - if (eventWriter) { - await eventWriter.close() + if (eventWriter && !eventWriterClosed) { + await eventWriter.close().catch((error) => { + logger.warn('Failed to close queued execution event writer', { + executionId, + error: toError(error).message, + }) + }) } await cleanupExecutionBase64Cache(executionId).catch((error) => { diff --git a/apps/sim/stores/terminal/console/storage.ts b/apps/sim/stores/terminal/console/storage.ts index 35f3eb91167..4ee136db81b 100644 --- a/apps/sim/stores/terminal/console/storage.ts +++ b/apps/sim/stores/terminal/console/storage.ts @@ -10,7 +10,7 @@ const MIGRATION_KEY = 'terminal-console-store-migrated' /** * Interval for persisting terminal state during active executions. * Kept short enough that a hard refresh during execution still has - * recent running entries persisted for the reconnect flow to find. + * recent rows available once the tab-local reconnect pointer resumes. 
*/ const EXECUTION_PERSIST_INTERVAL_MS = 5_000 @@ -94,16 +94,84 @@ export async function loadConsoleData(): Promise<PersistedConsoleData | null> { } } -let writeSequence = 0 let activeWrite: Promise<void> | null = null -function writeToIndexedDB(data: PersistedConsoleData): void { - const seq = ++writeSequence +interface PersistOptions { + merge?: boolean +} + +function entryTimestamp(entry: ConsoleEntry): number { + return Date.parse(entry.endedAt ?? entry.startedAt ?? entry.timestamp) +} + +function shouldReplaceEntry(existing: ConsoleEntry, incoming: ConsoleEntry): boolean { + if (existing.isRunning && !incoming.isRunning) return true + if (!existing.isRunning && incoming.isRunning) return false + return entryTimestamp(incoming) >= entryTimestamp(existing) +} + +function mergeEntries( + existingEntries: ConsoleEntry[] = [], + incomingEntries: ConsoleEntry[] = [] +): ConsoleEntry[] { + const entriesById = new Map<string, ConsoleEntry>() + const orderedIds: string[] = [] + + for (const entry of existingEntries) { + entriesById.set(entry.id, entry) + orderedIds.push(entry.id) + } + + for (const entry of incomingEntries) { + const existing = entriesById.get(entry.id) + if (!existing) { + entriesById.set(entry.id, entry) + orderedIds.push(entry.id) + continue + } + if (shouldReplaceEntry(existing, entry)) { + entriesById.set(entry.id, entry) + } + } + + return orderedIds + .map((id) => entriesById.get(id)) + .filter((entry): entry is ConsoleEntry => !!entry) +} +function mergePersistedConsoleData( + existing: PersistedConsoleData | null, + incoming: PersistedConsoleData +): PersistedConsoleData { + if (!existing) return incoming + const workflowIds = new Set([ + ...Object.keys(existing.workflowEntries), + ...Object.keys(incoming.workflowEntries), + ]) + const workflowEntries: Record<string, ConsoleEntry[]> = {} + + for (const workflowId of workflowIds) { + const entries = mergeEntries( + existing.workflowEntries[workflowId], + incoming.workflowEntries[workflowId] + ) + if (entries.length > 0) workflowEntries[workflowId] = entries + } + + return {
+ workflowEntries, + isOpen: incoming.isOpen, + } +} + +function writeToIndexedDB( + data: PersistedConsoleData, + { merge = true }: PersistOptions = {} +): Promise<void> { const doWrite = async () => { try { - const serialized = JSON.stringify(data) - if (seq !== writeSequence) return + const nextData = merge ? mergePersistedConsoleData(await loadConsoleData(), data) : data + const serialized = JSON.stringify(nextData) await set(STORE_KEY, serialized) } catch (error) { logger.warn('IndexedDB write failed', { error }) @@ -111,6 +179,7 @@ function writeToIndexedDB(data: PersistedConsoleData): void { } activeWrite = (activeWrite ?? Promise.resolve()).then(doWrite) + return activeWrite } /** @@ -153,8 +222,8 @@ class ConsolePersistenceManager { /** * Called by the store when a running entry is added during an active execution. - * Triggers one immediate persist so the reconnect flow can find running entries - * after a page refresh, then disables until the next execution starts. + * Triggers one immediate persist so refreshes can hydrate visible terminal rows, + * then disables until the next execution starts. */ onRunningEntryAdded(): void { if (!this.needsInitialPersist) return @@ -178,9 +247,9 @@ class ConsolePersistenceManager { * Triggers an immediate persist. Used for explicit user actions * like clearing the console, and for page-hide durability. */ - persist(): void { - if (!this.dataProvider) return - writeToIndexedDB(this.dataProvider()) + persist(options?: PersistOptions): Promise<void> { + if (!this.dataProvider) return Promise.resolve() + return writeToIndexedDB(this.dataProvider(), options) } private startSafetyTimer(): void { @@ -205,8 +274,8 @@ const EXEC_POINTER_PREFIX = 'terminal-active-execution:' /** * Lightweight pointer to an in-flight execution, persisted immediately on * execution start so the reconnect flow can find it even if no console - * entries have been written yet.
Keyed per-workflow so multiple tabs - running different workflows don't overwrite each other. + entries have been written yet. Stored in sessionStorage so ownership stays + scoped to the browser tab that started the run. */ export interface ExecutionPointer { workflowId: string @@ -217,9 +286,9 @@ export interface ExecutionPointer { export async function loadExecutionPointer(workflowId: string): Promise<ExecutionPointer | null> { if (typeof window === 'undefined') return null try { - const raw = await get(`${EXEC_POINTER_PREFIX}${workflowId}`) + const raw = window.sessionStorage.getItem(`${EXEC_POINTER_PREFIX}${workflowId}`) if (!raw) return null - const parsed = typeof raw === 'string' ? JSON.parse(raw) : raw + const parsed = JSON.parse(raw) if (!parsed?.executionId) return null return parsed as ExecutionPointer } catch { @@ -227,12 +296,25 @@ export async function loadExecutionPointer(workflowId: string): Promise<ExecutionPointer | null> {}) +export function saveExecutionPointer(pointer: ExecutionPointer): Promise<void> { + if (typeof window === 'undefined') return Promise.resolve() + try { + window.sessionStorage.setItem( + `${EXEC_POINTER_PREFIX}${pointer.workflowId}`, + JSON.stringify(pointer) + ) + } catch { + return Promise.resolve() + } + return Promise.resolve() } -export function clearExecutionPointer(workflowId: string): void { - if (typeof window === 'undefined') return - set(`${EXEC_POINTER_PREFIX}${workflowId}`, '').catch(() => {}) +export function clearExecutionPointer(workflowId: string): Promise<void> { + if (typeof window === 'undefined') return Promise.resolve() + try { + window.sessionStorage.removeItem(`${EXEC_POINTER_PREFIX}${workflowId}`) + } catch { + return Promise.resolve() + } + return Promise.resolve() } diff --git a/apps/sim/stores/terminal/console/store.test.ts b/apps/sim/stores/terminal/console/store.test.ts index da0606eaf48..00b2b3bb3cc 100644 --- a/apps/sim/stores/terminal/console/store.test.ts +++ b/apps/sim/stores/terminal/console/store.test.ts @@ -136,5 +136,59 @@ describe('terminal
console store', () => { expect(entry.isCanceled).toBe(true) expect(entry.isRunning).toBe(false) }) + + it('only cancels running entries for the requested execution when provided', () => { + useTerminalConsoleStore.getState().addConsole({ + workflowId: 'wf-1', + blockId: 'block-1', + blockName: 'Function 1', + blockType: 'function', + executionId: 'exec-1', + executionOrder: 1, + isRunning: true, + }) + useTerminalConsoleStore.getState().addConsole({ + workflowId: 'wf-1', + blockId: 'block-2', + blockName: 'Function 2', + blockType: 'function', + executionId: 'exec-2', + executionOrder: 2, + isRunning: true, + }) + + useTerminalConsoleStore.getState().cancelRunningEntries('wf-1', 'exec-1') + + const entries = useTerminalConsoleStore.getState().getWorkflowEntries('wf-1') + expect(entries.find((entry) => entry.executionId === 'exec-1')).toMatchObject({ + isCanceled: true, + isRunning: false, + }) + expect(entries.find((entry) => entry.executionId === 'exec-2')).toMatchObject({ + isRunning: true, + }) + }) + }) + + describe('finishRunningEntries', () => { + it('settles running entries without marking them canceled', () => { + useTerminalConsoleStore.getState().addConsole({ + workflowId: 'wf-1', + blockId: 'block-1', + blockName: 'Function', + blockType: 'function', + executionId: 'exec-1', + executionOrder: 1, + isRunning: true, + startedAt: new Date(Date.now() - 1000).toISOString(), + }) + + useTerminalConsoleStore.getState().finishRunningEntries('wf-1', 'exec-1') + + const [entry] = useTerminalConsoleStore.getState().getWorkflowEntries('wf-1') + expect(entry.isCanceled).toBe(false) + expect(entry.isRunning).toBe(false) + expect(entry.endedAt).toBeDefined() + }) }) }) diff --git a/apps/sim/stores/terminal/console/store.ts b/apps/sim/stores/terminal/console/store.ts index d6d8414a16a..8f666420a75 100644 --- a/apps/sim/stores/terminal/console/store.ts +++ b/apps/sim/stores/terminal/console/store.ts @@ -110,6 +110,14 @@ const matchesEntryForUpdate = ( return false } + if 
( + update.childWorkflowInstanceId !== undefined && + entry.childWorkflowInstanceId !== undefined && + entry.childWorkflowInstanceId !== update.childWorkflowInstanceId + ) { + return false + } + return true } @@ -328,7 +336,7 @@ export const useTerminalConsoleStore = create()( clearWorkflowConsole: (workflowId: string) => { set((state) => replaceWorkflowEntries(state, workflowId, EMPTY_CONSOLE_ENTRIES)) useExecutionStore.getState().clearRunPath(workflowId) - consolePersistence.persist() + consolePersistence.persist({ merge: false }) }, clearExecutionEntries: (executionId: string) => @@ -604,13 +612,17 @@ export const useTerminalConsoleStore = create()( } }, - cancelRunningEntries: (workflowId: string) => { + cancelRunningEntries: (workflowId: string, executionId?: string) => { set((state) => { const now = new Date() const workflowEntries = state.workflowEntries[workflowId] ?? EMPTY_CONSOLE_ENTRIES let didChange = false const updatedEntries = workflowEntries.map((entry) => { - if (entry.workflowId === workflowId && entry.isRunning) { + if ( + entry.workflowId === workflowId && + entry.isRunning && + (executionId === undefined || entry.executionId === executionId) + ) { didChange = true const durationMs = entry.startedAt ? now.getTime() - new Date(entry.startedAt).getTime() @@ -631,6 +643,38 @@ export const useTerminalConsoleStore = create()( return replaceWorkflowEntries(state, workflowId, updatedEntries) }) }, + + finishRunningEntries: (workflowId: string, executionId?: string) => { + set((state) => { + const now = new Date() + const workflowEntries = state.workflowEntries[workflowId] ?? EMPTY_CONSOLE_ENTRIES + let didChange = false + const updatedEntries = workflowEntries.map((entry) => { + if ( + entry.workflowId === workflowId && + entry.isRunning && + (executionId === undefined || entry.executionId === executionId) + ) { + didChange = true + const durationMs = entry.startedAt + ? 
now.getTime() - new Date(entry.startedAt).getTime() + : entry.durationMs + return { + ...entry, + isRunning: false, + isCanceled: false, + endedAt: now.toISOString(), + durationMs, + } + } + return entry + }) + if (!didChange) { + return state + } + return replaceWorkflowEntries(state, workflowId, updatedEntries) + }) + }, })) ) diff --git a/apps/sim/stores/terminal/console/types.ts b/apps/sim/stores/terminal/console/types.ts index 8aa342d7a11..09124547494 100644 --- a/apps/sim/stores/terminal/console/types.ts +++ b/apps/sim/stores/terminal/console/types.ts @@ -75,6 +75,7 @@ export interface ConsoleStore { getWorkflowEntries: (workflowId: string) => ConsoleEntry[] toggleConsole: () => void updateConsole: (blockId: string, update: string | ConsoleUpdate, executionId?: string) => void - cancelRunningEntries: (workflowId: string) => void + cancelRunningEntries: (workflowId: string, executionId?: string) => void + finishRunningEntries: (workflowId: string, executionId?: string) => void _hasHydrated: boolean }