From 0f9d1de68a5da067c01718ebc95f2b0ec687decb Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 14:04:36 +0100 Subject: [PATCH 01/13] feat(webapp,run-engine): per-org S2 basin migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move from a single shared basin with a fixed retention to per-org basins with retention tied to the org's billing plan (free 7d / hobby 30d / pro 365d). Stops S2 from deleting streams out from under live chat sessions when basin retention fires before the chat ends, unlocks per-org cost attribution via S2's basin metrics API, and narrows the blast radius of any leaked scoped token. OSS / s2-lite installs are unaffected: provisioning is gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED` and the read precedence falls back to the global basin env var when an entity has no stamped basin. Schema: nullable `streamBasinName` on Organization / TaskRun / Session (migration `20260504071227_add_stream_basin_name`). Stamped at provisioning / trigger / session-create. Reads resolve via `run.streamBasinName ?? session.streamBasinName ?? legacy`. Provisioner: new `streamBasinProvisioner.server.ts` creates basins via S2's `POST /v1/basins`, reconfigures via `PATCH /v1/basins/{name}`, maps plan codes to retention durations. Idempotent on race / pre-existing basin (treats 409 as success). Org create wires it synchronously with soft-fail. Plan changes in `setPlan` enqueue `v3.reconfigureStreamBasinForOrg` next to existing billing-cache invalidations. Worker jobs: `v3.provisionStreamBasinForOrg` (backfill / retry) and `v3.reconfigureStreamBasinForOrg` (plan change) on commonWorker. Read path: `getRealtimeStreamInstance` becomes a factory keyed by `{ run, session }` basin context; stream prefix drops `org/{orgId}` segment for per-org basins (basin already isolates) and keeps it for the legacy basin (orgs share). Access-token cache key includes basin to prevent cross-contamination. 
Admin routes: POST /admin/api/v1/stream-basins/backfill — fan out provisioning jobs for every org with `streamBasinName: null`. dryRun + limit flags. GET returns progress (`provisioned / totalOrgs`). POST /admin/api/v1/stream-basins/reconfigure — enqueue worker job (queued mode) or run inline with `tier` override (escape hatch). Run-engine: `streamBasinName` added to `TriggerParams` (optional); the V2 trigger path stamps it onto the new TaskRun. No changes to `MinimalAuthenticatedEnvironment` — stamping is a trigger-time concern, not a queue concern. Verified end-to-end with chat.agent locally: backfill creates basins with right retention (7d free), reconfigure flips retention via plan change (30d hobby / 365d pro), chat streams land in the per-org basin, zero leakage to the global fallback basin, multi-turn reuses the same in/out stream pair. --- apps/webapp/app/env.server.ts | 22 ++ apps/webapp/app/models/organization.server.ts | 21 ++ .../admin.api.v1.stream-basins.backfill.ts | 165 ++++++++++ .../admin.api.v1.stream-basins.reconfigure.ts | 63 ++++ ....runs.$runFriendlyId.input-streams.wait.ts | 4 +- ...uns.$runFriendlyId.session-streams.wait.ts | 4 +- apps/webapp/app/routes/api.v1.sessions.ts | 8 + ...ealtime.v1.sessions.$session.$io.append.ts | 4 +- .../realtime.v1.sessions.$session.$io.ts | 8 +- .../realtime.v1.streams.$runId.$streamId.ts | 8 +- ...streams.$runId.$target.$streamId.append.ts | 4 +- ...ime.v1.streams.$runId.$target.$streamId.ts | 46 ++- ...ltime.v1.streams.$runId.input.$streamId.ts | 7 +- .../route.tsx | 3 +- .../runEngine/services/triggerTask.server.ts | 5 + .../webapp/app/services/platform.v3.server.ts | 30 ++ .../realtime/s2realtimeStreams.server.ts | 5 +- .../realtime/streamBasinProvisioner.server.ts | 292 ++++++++++++++++++ .../realtime/v1StreamsGlobal.server.ts | 96 ++++-- apps/webapp/app/v3/commonWorker.server.ts | 59 ++++ .../migration.sql | 8 + .../database/prisma/schema.prisma | 18 ++ .../run-engine/src/engine/index.ts | 5 + 
.../run-engine/src/engine/types.ts | 5 + 24 files changed, 837 insertions(+), 53 deletions(-) create mode 100644 apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts create mode 100644 apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts create mode 100644 apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts create mode 100644 internal-packages/database/prisma/migrations/20260504071227_add_stream_basin_name/migration.sql diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index ff27168445a..c2b280a7a57 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1506,6 +1506,28 @@ const EnvironmentSchema = z REALTIME_STREAMS_S2_FLUSH_INTERVAL_MS: z.coerce.number().int().default(100), REALTIME_STREAMS_S2_MAX_RETRIES: z.coerce.number().int().default(10), REALTIME_STREAMS_S2_WAIT_SECONDS: z.coerce.number().int().default(60), + /// Per-org basin migration. When "true", the webapp provisions a + /// dedicated S2 basin per org with plan-tied retention and stamps + /// `streamBasinName` on new TaskRun / Session rows. OSS / s2-lite + /// installs leave this off and keep using the single basin defined + /// by `REALTIME_STREAMS_S2_BASIN`. + REALTIME_STREAMS_PER_ORG_BASINS_ENABLED: z.enum(["true", "false"]).default("false"), + /// Naming pattern for per-org basins: `{prefix}-{env}-org-{slug}` + /// e.g. `triggerdotdev-prod-org-acme-corp`. Cluster + tier shorthand + /// — kept short to stay under S2's basin-name length limit. + REALTIME_STREAMS_BASIN_NAME_PREFIX: z.string().default("triggerdotdev"), + REALTIME_STREAMS_BASIN_NAME_ENV: z.string().default("dev"), + /// Plan-tier retention strings (S2 duration syntax: 7d / 30d / 1y). + /// Free / hobby / pro line up with billing tiers; enterprise uses + /// the pro default and is reconfigured per-contract via the API. 
+ REALTIME_STREAMS_BASIN_RETENTION_FREE: z.string().default("7d"), + REALTIME_STREAMS_BASIN_RETENTION_HOBBY: z.string().default("30d"), + REALTIME_STREAMS_BASIN_RETENTION_PRO: z.string().default("365d"), + /// Storage class applied to per-org basins at create time. + REALTIME_STREAMS_BASIN_STORAGE_CLASS: z.enum(["express", "standard"]).default("express"), + /// `delete_on_empty_min_age` applied to per-org basins. Streams + /// that go empty for this long are reaped automatically. + REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE: z.string().default("1h"), REALTIME_STREAMS_DEFAULT_VERSION: z.enum(["v1", "v2"]).default("v1"), WAIT_UNTIL_TIMEOUT_MS: z.coerce.number().int().default(600_000), diff --git a/apps/webapp/app/models/organization.server.ts b/apps/webapp/app/models/organization.server.ts index 14315dd337c..b728626b36c 100644 --- a/apps/webapp/app/models/organization.server.ts +++ b/apps/webapp/app/models/organization.server.ts @@ -14,6 +14,8 @@ import { env } from "~/env.server"; import { featuresForUrl } from "~/features.server"; import { createApiKeyForEnv, createPkApiKeyForEnv, envSlug } from "./api-key.server"; import { getDefaultEnvironmentConcurrencyLimit } from "~/services/platform.v3.server"; +import { logger } from "~/services/logger.server"; +import { provisionBasinForOrg } from "~/services/realtime/streamBasinProvisioner.server"; export type { Organization }; const nanoid = customAlphabet("1234567890abcdef", 4); @@ -82,6 +84,25 @@ export async function createOrganization( }, }); + // Provision the org's S2 basin synchronously so the very first run + // gets `streamBasinName` stamped via the existing org read. Soft-fail + // on S2 errors so a transient outage doesn't block signup — the + // backfill reconciler picks up any org left with `streamBasinName: null`. + // No-op when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` (OSS mode). 
+ try { + await provisionBasinForOrg({ + id: organization.id, + slug: organization.slug, + tier: "free", // new orgs always start on free retention + streamBasinName: organization.streamBasinName, + }); + } catch (error) { + logger.warn("[createOrganization] streamBasin provisioning failed; backfill will retry", { + orgId: organization.id, + error: error instanceof Error ? error.message : String(error), + }); + } + return { ...organization }; } diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts new file mode 100644 index 00000000000..f35fa842ee0 --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts @@ -0,0 +1,165 @@ +import { json, type ActionFunctionArgs } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { prisma } from "~/db.server"; +import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { isPerOrgBasinsEnabled } from "~/services/realtime/streamBasinProvisioner.server"; +import { commonWorker } from "~/v3/commonWorker.server"; +import { logger } from "~/services/logger.server"; + +/** + * One-shot backfill that enqueues `v3.provisionStreamBasinForOrg` for + * every org with `streamBasinName: null`. Idempotent — re-running picks + * up only the orgs that haven't been provisioned yet, and the worker + * job itself is also idempotent (the provisioner short-circuits if the + * org column is already set). + * + * - Admin auth via `requireAdminApiRequest` (PAT in `Authorization`). + * - Refuses to run when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` + * so OSS / s2-lite installs can't accidentally trigger basin + * creation against a misconfigured backend. + * - `dryRun=true` (default false) returns the count without enqueueing. + * - `limit` (default 1000, max 10000) caps a single invocation. Run + * again to process more — the column filter naturally walks the + * queue forward each call. 
+ * - Each job is keyed `provisionStreamBasin:{orgId}` so concurrent
+ * backfill calls converge to one job per org instead of duplicating.
+ *
+ * Run from a shell:
+ * curl -X POST -H "Authorization: Bearer $PAT" \
+ * "https://api.trigger.dev/admin/api/v1/stream-basins/backfill?limit=200&dryRun=true"
+ */
+
+const BodySchema = z
+ .object({
+ dryRun: z.boolean().optional().default(false),
+ limit: z.number().int().min(1).max(10_000).optional().default(1000),
+ })
+ .strict();
+
+type BackfillResponse = {
+ ok: true;
+ dryRun: boolean;
+ enqueued: number;
+ pending: number;
+ remaining: number;
+ orgIds: string[];
+};
+
+export async function action({ request }: ActionFunctionArgs) {
+ await requireAdminApiRequest(request);
+
+ if (!isPerOrgBasinsEnabled()) {
+ return json(
+ {
+ ok: false,
+ error:
+ "Per-org stream basins are disabled. Set REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true before running the backfill.",
+ },
+ { status: 400 }
+ );
+ }
+
+ // `application/json` POST body — empty body falls back to defaults so
+ // a parameterless POST does the right thing for the default backfill.
+ let parsed: z.infer<typeof BodySchema>;
+ try {
+ const text = await request.text();
+ const raw = text.length > 0 ? JSON.parse(text) : {};
+ const result = BodySchema.safeParse(raw);
+ if (!result.success) {
+ return json({ ok: false, error: result.error.flatten() }, { status: 400 });
+ }
+ parsed = result.data;
+ } catch {
+ return json({ ok: false, error: "Invalid JSON body" }, { status: 400 });
+ }
+
+ const { dryRun, limit } = parsed;
+
+ // Page candidate orgs. Ordered by createdAt so re-runs walk the queue
+ // forward predictably; deletedAt filter avoids resurrecting orgs.
+ const candidates = await prisma.organization.findMany({
+ where: {
+ streamBasinName: null,
+ deletedAt: null,
+ },
+ orderBy: { createdAt: "asc" },
+ take: limit,
+ select: { id: true },
+ });
+
+ // Total count of remaining nulls (for progress reporting). 
+ const remainingTotal = await prisma.organization.count({ + where: { streamBasinName: null, deletedAt: null }, + }); + + if (dryRun) { + const response: BackfillResponse = { + ok: true, + dryRun: true, + enqueued: 0, + pending: candidates.length, + remaining: Math.max(0, remainingTotal - candidates.length), + orgIds: candidates.map((o) => o.id), + }; + return json(response); + } + + // Enqueue one job per org. Per-org dedupe key collapses concurrent + // backfill calls into a single pending job, and a job that's already + // run (basin set) is a no-op on the worker side. + let enqueued = 0; + for (const org of candidates) { + try { + await commonWorker.enqueue({ + job: "v3.provisionStreamBasinForOrg", + payload: { orgId: org.id }, + id: `provisionStreamBasin:${org.id}`, + }); + enqueued += 1; + } catch (error) { + logger.error("[stream-basins-backfill] enqueue failed", { + orgId: org.id, + error: error instanceof Error ? error.message : String(error), + }); + } + } + + const response: BackfillResponse = { + ok: true, + dryRun: false, + enqueued, + pending: candidates.length, + remaining: Math.max(0, remainingTotal - enqueued), + orgIds: candidates.map((o) => o.id), + }; + + logger.info("[stream-basins-backfill] enqueued provisioning jobs", { + enqueued, + candidates: candidates.length, + remaining: response.remaining, + }); + + return json(response); +} + +// GET returns the current state without doing anything — useful for +// monitoring "is the backfill done yet?" from a dashboard / curl. 
+export async function loader({ request }: ActionFunctionArgs) { + await requireAdminApiRequest(request); + + const totalOrgs = await prisma.organization.count({ where: { deletedAt: null } }); + const provisioned = await prisma.organization.count({ + where: { deletedAt: null, NOT: { streamBasinName: null } }, + }); + const remaining = totalOrgs - provisioned; + + return json({ + ok: true, + perOrgBasinsEnabled: isPerOrgBasinsEnabled(), + totalOrgs, + provisioned, + remaining, + completion: totalOrgs === 0 ? 1 : provisioned / totalOrgs, + }); +} diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts new file mode 100644 index 00000000000..4405213699a --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -0,0 +1,63 @@ +import { json, type ActionFunctionArgs } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { + isPerOrgBasinsEnabled, + reconfigureBasinForOrg, + type StreamBasinTier, +} from "~/services/realtime/streamBasinProvisioner.server"; +import { commonWorker } from "~/v3/commonWorker.server"; + +/** + * Admin trigger for `v3.reconfigureStreamBasinForOrg`. The plan-change + * path in `setPlan` already enqueues this automatically in cloud mode; + * this route exists for ops + e2e testing. + * + * - Default (`{ orgId }`): enqueues the worker job which resolves the + * tier via `getCurrentPlan` and PATCHes the basin to match. No-op + * locally because `getCurrentPlan` is gated to cloud hosts. + * - With `tier`: bypasses the billing lookup and runs reconfigure + * inline against the given tier. Useful for validating the PATCH + * wire shape end-to-end and as a manual override (e.g. enterprise + * contract retention). 
+ */ +const BodySchema = z + .object({ + orgId: z.string(), + tier: z.enum(["free", "hobby", "pro"]).optional(), + }) + .strict(); + +export async function action({ request }: ActionFunctionArgs) { + await requireAdminApiRequest(request); + + if (!isPerOrgBasinsEnabled()) { + return json( + { ok: false, error: "Per-org stream basins are disabled." }, + { status: 400 } + ); + } + + const text = await request.text(); + const parsed = BodySchema.safeParse(JSON.parse(text)); + if (!parsed.success) { + return json({ ok: false, error: parsed.error.flatten() }, { status: 400 }); + } + + if (parsed.data.tier) { + // Direct, synchronous reconfigure with the explicit tier override. + // Skips the worker queue + billing lookup so the PATCH is verifiable + // in the response. Errors surface as 500. + const tier: StreamBasinTier = parsed.data.tier; + await reconfigureBasinForOrg(parsed.data.orgId, tier); + return json({ ok: true, mode: "inline", orgId: parsed.data.orgId, tier }); + } + + await commonWorker.enqueue({ + job: "v3.reconfigureStreamBasinForOrg", + payload: { orgId: parsed.data.orgId }, + id: `reconfigureStreamBasin:${parsed.data.orgId}`, + }); + + return json({ ok: true, mode: "queued", enqueued: parsed.data.orgId }); +} diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.input-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.input-streams.wait.ts index 8e41e9fe4c8..a0f24f9abd8 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.input-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.input-streams.wait.ts @@ -40,6 +40,7 @@ const { action, loader } = createActionApiRoute( id: true, friendlyId: true, realtimeStreamsVersion: true, + streamBasinName: true, }, }); @@ -98,7 +99,8 @@ const { action, loader } = createActionApiRoute( try { const realtimeStream = getRealtimeStreamInstance( authentication.environment, - run.realtimeStreamsVersion + run.realtimeStreamsVersion, + { run } ); const records = await 
realtimeStream.readRecords( diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index 18034caab47..0b8df65eb5d 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -128,7 +128,9 @@ const { action, loader } = createActionApiRoute( // hardcode "v2", so the race-check reader has to match. // Don't fall through to the run's own `realtimeStreamsVersion`, // which only describes the run's run-scoped streams. - const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { + session: maybeSession, + }); if (realtimeStream instanceof S2RealtimeStreams) { const records = await realtimeStream.readSessionStreamRecords( diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index 38270fdfc77..708d791ff49 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -167,6 +167,10 @@ const { action } = createActionApiRoute( runtimeEnvironmentId: authentication.environment.id, environmentType: authentication.environment.type, organizationId: authentication.environment.organizationId, + // Stamp the org's S2 basin so realtime reads on this + // session's `.in/.out` channels resolve without joining + // Organization. Null until per-org basins are provisioned. 
+ streamBasinName: authentication.environment.organization.streamBasinName, }, update: { triggerConfig: triggerConfigJson }, }); @@ -186,6 +190,10 @@ const { action } = createActionApiRoute( runtimeEnvironmentId: authentication.environment.id, environmentType: authentication.environment.type, organizationId: authentication.environment.organizationId, + // Stamp the org's S2 basin so realtime reads on this + // session's `.in/.out` channels resolve without joining + // Organization. Null until per-org basins are provisioned. + streamBasinName: authentication.environment.organization.streamBasinName, }, }); } diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts index 4251baae91e..45fbde5924b 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -81,7 +81,9 @@ const { action, loader } = createActionApiRoute( ); } - const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { + session, + }); if (!(realtimeStream instanceof S2RealtimeStreams)) { return json( diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index c04992f7f14..36a30761b01 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -59,7 +59,9 @@ const { action } = createActionApiRoute( }); } - const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { + session: maybeSession, + }); if (!(realtimeStream instanceof S2RealtimeStreams)) { return new Response("Session channels require the S2 realtime backend", { @@ -122,7 +124,9 @@ 
const loader = createLoaderApiRoute( }, }, async ({ params, request, authentication, resource }) => { - const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { + session: resource.row, + }); if (!(realtimeStream instanceof S2RealtimeStreams)) { return new Response("Session channels require the S2 realtime backend", { diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$streamId.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$streamId.ts index aabd83bc9bb..477ce781a20 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.$streamId.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$streamId.ts @@ -29,6 +29,7 @@ export async function action({ request, params }: ActionFunctionArgs) { select: { id: true, friendlyId: true, + streamBasinName: true, runtimeEnvironment: { include: { project: true, @@ -64,7 +65,9 @@ export async function action({ request, params }: ActionFunctionArgs) { } // The runtimeEnvironment from the run is already in the correct shape for AuthenticatedEnvironment - const realtimeStream = getRealtimeStreamInstance(run.runtimeEnvironment, streamVersion); + const realtimeStream = getRealtimeStreamInstance(run.runtimeEnvironment, streamVersion, { + run, + }); return realtimeStream.ingestData( request.body, @@ -127,7 +130,8 @@ export const loader = createLoaderApiRoute( const realtimeStream = getRealtimeStreamInstance( authentication.environment, - run.realtimeStreamsVersion + run.realtimeStreamsVersion, + { run } ); return realtimeStream.streamResponse(request, run.friendlyId, params.streamId, getRequestAbortSignal(), { diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts index deefbc20773..ec5800c1f9f 100644 --- 
a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts @@ -72,6 +72,7 @@ const { action } = createActionApiRoute( realtimeStreamsVersion: true, completedAt: true, id: true, + streamBasinName: true, }, }); @@ -102,7 +103,8 @@ const { action } = createActionApiRoute( const realtimeStream = getRealtimeStreamInstance( authentication.environment, - targetRun.realtimeStreamsVersion + targetRun.realtimeStreamsVersion, + { run: targetRun } ); const partId = request.headers.get("X-Part-Id") ?? nanoid(7); diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts index 2a8d07053d9..9ca8e36f4ef 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts @@ -26,14 +26,17 @@ const { action } = createActionApiRoute( select: { id: true, friendlyId: true, + streamBasinName: true, parentTaskRun: { select: { friendlyId: true, + streamBasinName: true, }, }, rootTaskRun: { select: { friendlyId: true, + streamBasinName: true, }, }, }, @@ -43,17 +46,20 @@ const { action } = createActionApiRoute( return new Response("Run not found", { status: 404 }); } - const targetId = + const targetRun = params.target === "self" - ? run.friendlyId + ? run : params.target === "parent" - ? run.parentTaskRun?.friendlyId - : run.rootTaskRun?.friendlyId; + ? run.parentTaskRun + : run.rootTaskRun; - if (!targetId) { + if (!targetRun?.friendlyId) { return new Response("Target not found", { status: 404 }); } + const targetId = targetRun.friendlyId; + const basinContext = { run: { streamBasinName: targetRun.streamBasinName ?? 
null } }; + if (request.method === "PUT") { // This is the "create" endpoint const updatedRun = await prisma.taskRun.update({ @@ -80,7 +86,8 @@ const { action } = createActionApiRoute( const realtimeStream = getRealtimeStreamInstance( authentication.environment, - updatedRun.realtimeStreamsVersion + updatedRun.realtimeStreamsVersion, + basinContext ); const { responseHeaders } = await realtimeStream.initializeStream(targetId, params.streamId); @@ -112,7 +119,11 @@ const { action } = createActionApiRoute( resumeFromChunkNumber = parsed; } - const realtimeStream = getRealtimeStreamInstance(authentication.environment, streamVersion); + const realtimeStream = getRealtimeStreamInstance( + authentication.environment, + streamVersion, + basinContext + ); return realtimeStream.ingestData( request.body, @@ -139,14 +150,17 @@ const loader = createLoaderApiRoute( select: { id: true, friendlyId: true, + streamBasinName: true, parentTaskRun: { select: { friendlyId: true, + streamBasinName: true, }, }, rootTaskRun: { select: { friendlyId: true, + streamBasinName: true, }, }, }, @@ -158,17 +172,19 @@ const loader = createLoaderApiRoute( return new Response("Run not found", { status: 404 }); } - const targetId = + const targetRun = params.target === "self" - ? run.friendlyId + ? run : params.target === "parent" - ? run.parentTaskRun?.friendlyId - : run.rootTaskRun?.friendlyId; + ? 
run.parentTaskRun + : run.rootTaskRun; - if (!targetId) { + if (!targetRun?.friendlyId) { return new Response("Target not found", { status: 404 }); } + const targetId = targetRun.friendlyId; + // Handle HEAD request to get last chunk index if (request.method !== "HEAD") { return new Response("Only HEAD requests are allowed for this endpoint", { status: 405 }); @@ -178,7 +194,11 @@ const loader = createLoaderApiRoute( const clientId = request.headers.get("X-Client-Id") || "default"; const streamVersion = request.headers.get("X-Stream-Version") || "v1"; - const realtimeStream = getRealtimeStreamInstance(authentication.environment, streamVersion); + const realtimeStream = getRealtimeStreamInstance( + authentication.environment, + streamVersion, + { run: { streamBasinName: targetRun.streamBasinName ?? null } } + ); const lastChunkIndex = await realtimeStream.getLastChunkIndex( targetId, diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.input.$streamId.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.input.$streamId.ts index b16b1ca7922..089f2dc55e3 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.input.$streamId.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.input.$streamId.ts @@ -46,6 +46,7 @@ const { action } = createActionApiRoute( friendlyId: true, completedAt: true, realtimeStreamsVersion: true, + streamBasinName: true, }, }); @@ -68,7 +69,8 @@ const { action } = createActionApiRoute( const realtimeStream = getRealtimeStreamInstance( authentication.environment, - run.realtimeStreamsVersion + run.realtimeStreamsVersion, + { run } ); // Build the input stream record (raw user data, no wrapper) @@ -155,7 +157,8 @@ const loader = createLoaderApiRoute( const realtimeStream = getRealtimeStreamInstance( authentication.environment, - run.realtimeStreamsVersion + run.realtimeStreamsVersion, + { run } ); // Read from the internal S2 stream name (prefixed to avoid user stream collisions) diff --git 
a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx index 1295adb7842..b6a72d3aa09 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx @@ -87,7 +87,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const realtimeStream = getRealtimeStreamInstance( run.runtimeEnvironment, - run.realtimeStreamsVersion + run.realtimeStreamsVersion, + { run } ); return realtimeStream.streamResponse(request, run.friendlyId, streamKey, getRequestAbortSignal(), { diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index 610484e67ca..0d58b607b69 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -395,6 +395,11 @@ export class RunEngineTriggerTaskService { bulkActionId: body.options?.bulkActionId, planType, realtimeStreamsVersion: options.realtimeStreamsVersion, + // Stamp the org's S2 basin onto the new TaskRun so + // realtime read paths can resolve the basin without + // joining `Organization`. Null in OSS / pre-backfill; + // reads then fall back to the global basin env var. 
+ streamBasinName: environment.organization.streamBasinName, debounce: body.options?.debounce, annotations, // When debouncing with triggerAndWait, create a span for the debounced trigger diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index 51075c1b87d..4a9162cc129 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ b/apps/webapp/app/services/platform.v3.server.ts @@ -392,6 +392,7 @@ export async function setPlan( // Invalidate billing cache since plan changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); + await enqueueStreamBasinReconfigure(organization.id); return redirect(newProjectPath(organization, "You're on the Free plan.")); } else { return redirectWithErrorMessage( @@ -409,17 +410,46 @@ export async function setPlan( // Invalidate billing cache since subscription changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); + await enqueueStreamBasinReconfigure(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription updated successfully."); } case "canceled_subscription": { // Invalidate billing cache since subscription was canceled opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); + await enqueueStreamBasinReconfigure(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription canceled."); } } } +/** + * Best-effort enqueue: when an org's plan changes we want the per-org + * S2 basin's retention to follow (free=7d, hobby=30d, pro=365d). The + * worker job is idempotent and a no-op when per-org basins are disabled + * or the org has no basin yet (OSS / pre-backfill). Failures are + * logged but never block the plan change itself — billing has already + * accepted by the time we reach this code. 
+ */ +async function enqueueStreamBasinReconfigure(orgId: string) { + try { + const { commonWorker } = await import("~/v3/commonWorker.server"); + await commonWorker.enqueue({ + job: "v3.reconfigureStreamBasinForOrg", + payload: { orgId }, + // Per-org dedupe key — concurrent plan changes collapse to one + // pending reconfigure job. The job re-reads the current plan + // when it executes, so the latest tier wins. + id: `reconfigureStreamBasin:${orgId}`, + }); + } catch (error) { + logger.warn("[setPlan] failed to enqueue stream basin reconfigure", { + orgId, + error: error instanceof Error ? error.message : String(error), + }); + } +} + export async function setConcurrencyAddOn(organizationId: string, amount: number) { if (!client) return undefined; diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 46c7f3854a1..0295d5a58b6 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -464,7 +464,10 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { return this.s2IssueAccessToken(id); } - const result = await this.cache.accessToken.swr(this.streamPrefix, async () => { + // Cache key includes basin so per-org basins never collide on + // cached tokens. `${basin}:${prefix}` is unique per (org-basin, env). + const cacheKey = `${this.basin}:${this.streamPrefix}`; + const result = await this.cache.accessToken.swr(cacheKey, async () => { return this.s2IssueAccessToken(id); }); diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts new file mode 100644 index 00000000000..b115202995b --- /dev/null +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -0,0 +1,292 @@ +/** + * Per-org S2 basin provisioning. 
+ * + * The webapp runs in two modes for realtime stream storage: + * + * - **Single-basin mode** (OSS / s2-lite installs): + * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false`. All orgs share the + * basin in `REALTIME_STREAMS_S2_BASIN`. `Organization.streamBasinName` + * stays null forever; reads / writes resolve to the global basin. + * + * - **Per-org-basin mode** (cloud): + * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true`. Each org gets a + * dedicated basin with retention tied to its billing plan. The + * basin is the unit of cost attribution (S2 exposes per-basin + * metrics) and isolation (access tokens scope to one basin). + * + * Provisioning is one-shot per org: at creation time (or a one-off + * backfill for existing orgs) we create the basin and stamp + * `Organization.streamBasinName`. New `TaskRun` / `Session` rows then + * piggyback on the existing org read in `triggerTask` / session-create + * paths and copy the value through. Reads use a precedence chain + * (`run.streamBasinName ?? session.streamBasinName ?? globalBasin`). + * + * Plan changes update retention in-place via `reconfigureBasin`. We do + * not move data across basins. + */ +import type { PrismaClientOrTransaction } from "~/db.server"; +import { prisma } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; + +/** + * Plan-tier shorthand for retention mapping. Callers translate the + * org's billing plan (via `getCurrentPlan`) into one of these and pass + * it to the provisioner. New orgs (no plan yet) and unbilled orgs + * default to `free` so we don't accidentally grant a year of retention + * to a freeloader. 
+ */ +export type StreamBasinTier = "free" | "hobby" | "pro"; + +export function retentionFor(tier: StreamBasinTier): string { + switch (tier) { + case "pro": + return env.REALTIME_STREAMS_BASIN_RETENTION_PRO; + case "hobby": + return env.REALTIME_STREAMS_BASIN_RETENTION_HOBBY; + case "free": + default: + return env.REALTIME_STREAMS_BASIN_RETENTION_FREE; + } +} + +/** + * Permissive plan-name → tier mapping. Billing returns various strings + * over time (`free_connected`, `hobby`, `team_pro`, `enterprise`, etc.) + * — be forgiving but predictable. + */ +export function planTierFor(planType: string | null | undefined): StreamBasinTier { + if (!planType) return "free"; + const normalized = planType.toLowerCase(); + if (normalized.includes("pro") || normalized.includes("team") || normalized.includes("enterprise")) { + return "pro"; + } + if (normalized.includes("hobby") || normalized.includes("starter")) { + return "hobby"; + } + return "free"; +} + +export function isPerOrgBasinsEnabled(): boolean { + return env.REALTIME_STREAMS_PER_ORG_BASINS_ENABLED === "true"; +} + +/** + * Build the basin name for an org. Format: `{prefix}-{env}-org-{slug}` + * (e.g. `triggerdotdev-prod-org-acme-corp`). The org slug is already + * lowercase-and-hyphenated by `createOrganization`, so it satisfies S2 + * basin-name rules without further normalization. We truncate + * defensively to keep total length under 63 chars (a common bucket + * convention; verify against S2 docs before raising). + */ +export function basinNameForOrg(org: { slug: string }): string { + const prefix = env.REALTIME_STREAMS_BASIN_NAME_PREFIX; + const envName = env.REALTIME_STREAMS_BASIN_NAME_ENV; + const head = `${prefix}-${envName}-org-`; + const budget = 63 - head.length; + const slug = org.slug.slice(0, budget); + return `${head}${slug}`; +} + +type ProvisionInput = { + id: string; + slug: string; + /// Caller decides the tier. 
Org-create path passes `"free"` for new + /// orgs; the backfill worker resolves the tier via `getCurrentPlan` + /// before calling. Defaults to `"free"` if omitted. + tier?: StreamBasinTier; + streamBasinName: string | null | undefined; +}; + +type ProvisionResult = + | { kind: "skipped"; reason: "feature-disabled" | "already-provisioned"; basin: string | null } + | { kind: "provisioned"; basin: string; retention: string }; + +/** + * Idempotent: if the org already has `streamBasinName`, returns the + * existing value without contacting S2. Otherwise creates the basin + * (S2 returns 409 on race with another caller — we treat that as + * success) and writes the column. + * + * Failure modes: + * - S2 unreachable / 5xx: throws. Callers in the org-create path + * should swallow + enqueue a retry job so signup never fails on a + * transient S2 outage. The backfill worker retries naturally. + * - Auth misconfig (no token): throws. Should never happen in + * per-org-basins mode but worth surfacing loudly. + */ +export async function provisionBasinForOrg( + org: ProvisionInput, + prismaClient: PrismaClientOrTransaction = prisma +): Promise { + if (!isPerOrgBasinsEnabled()) { + return { kind: "skipped", reason: "feature-disabled", basin: null }; + } + + if (org.streamBasinName) { + return { kind: "skipped", reason: "already-provisioned", basin: org.streamBasinName }; + } + + const accessToken = env.REALTIME_STREAMS_S2_ACCESS_TOKEN; + if (!accessToken) { + throw new Error( + "REALTIME_STREAMS_S2_ACCESS_TOKEN must be set when REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true" + ); + } + + const basin = basinNameForOrg(org); + const retention = retentionFor(org.tier ?? 
"free"); + + await s2CreateBasin(basin, { + accessToken, + retentionPolicy: retention, + storageClass: env.REALTIME_STREAMS_BASIN_STORAGE_CLASS, + deleteOnEmptyMinAge: env.REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE, + }); + + await prismaClient.organization.update({ + where: { id: org.id }, + data: { streamBasinName: basin }, + }); + + logger.info("[streamBasinProvisioner] provisioned basin for org", { + orgId: org.id, + basin, + retention, + }); + + return { kind: "provisioned", basin, retention }; +} + +/** + * Update retention after a plan change. Idempotent. No-op when the + * org has no provisioned basin. Caller resolves the tier and passes + * it in — keeps the provisioner ignorant of billing. + */ +export async function reconfigureBasinForOrg( + orgId: string, + tier: StreamBasinTier +): Promise { + if (!isPerOrgBasinsEnabled()) return; + + const accessToken = env.REALTIME_STREAMS_S2_ACCESS_TOKEN; + if (!accessToken) return; + + const org = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { id: true, streamBasinName: true }, + }); + if (!org?.streamBasinName) return; + + const retention = retentionFor(tier); + await s2ReconfigureBasin(org.streamBasinName, { accessToken, retentionPolicy: retention }); + + logger.info("[streamBasinProvisioner] reconfigured basin retention", { + orgId, + basin: org.streamBasinName, + retention, + }); +} + +// ---------- S2 REST ---------- +// +// Account-level API: `POST /v1/basins` to create, `PATCH /v1/basins/{name}` +// to reconfigure. The wire shape uses integer seconds for durations +// (`retention_policy.age`, `delete_on_empty.min_age_secs`) — the human +// strings (`7d`, `30d`, `1y`) are env-var ergonomics that we parse on +// the way out. + +type CreateBasinOptions = { + accessToken: string; + retentionPolicy: string; // e.g. "7d", "30d", "365d" + storageClass: "express" | "standard"; + deleteOnEmptyMinAge: string; // e.g. 
"1h" +}; + +async function s2CreateBasin(name: string, opts: CreateBasinOptions): Promise { + const url = `https://aws.s2.dev/v1/basins`; + const body = { + basin: name, + config: { + create_stream_on_append: true, + create_stream_on_read: true, + default_stream_config: { + storage_class: opts.storageClass, + retention_policy: { age: durationToSeconds(opts.retentionPolicy) }, + delete_on_empty: { min_age_secs: durationToSeconds(opts.deleteOnEmptyMinAge) }, + }, + }, + }; + + const res = await fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${opts.accessToken}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + // 200/201 = created. 409 = basin already exists (race with another + // caller, or a previous run that crashed after S2 ack but before our + // column write committed) — treat as success. + if (res.ok || res.status === 409) return; + + const text = await res.text().catch(() => ""); + throw new Error(`S2 createBasin failed: ${res.status} ${res.statusText} ${text}`); +} + +type ReconfigureBasinOptions = { + accessToken: string; + retentionPolicy: string; +}; + +async function s2ReconfigureBasin(name: string, opts: ReconfigureBasinOptions): Promise { + const url = `https://aws.s2.dev/v1/basins/${encodeURIComponent(name)}`; + const body = { + default_stream_config: { + retention_policy: { age: durationToSeconds(opts.retentionPolicy) }, + }, + }; + + const res = await fetch(url, { + method: "PATCH", + headers: { + Authorization: `Bearer ${opts.accessToken}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (res.ok) return; + + const text = await res.text().catch(() => ""); + throw new Error(`S2 reconfigureBasin failed: ${res.status} ${res.statusText} ${text}`); +} + +/** + * Parse a short duration string (e.g. `7d`, `30d`, `365d`, `1h`, `90m`, + * `45s`, `2w`) into seconds. Tolerant of `7days` and `1week` forms too. 
+ * Throws on garbage so a misconfigured env var fails loudly at first use. + */ +function durationToSeconds(input: string): number { + const trimmed = input.trim().toLowerCase(); + const match = trimmed.match(/^(\d+)\s*(s|sec|secs|seconds?|m|min|mins|minutes?|h|hour|hours?|d|day|days?|w|week|weeks?|y|year|years?)$/); + if (!match) { + throw new Error(`Invalid duration string: ${input}`); + } + const value = parseInt(match[1]!, 10); + const unit = match[2]!; + const multiplier = + /^s/.test(unit) ? 1 + : /^m(?:in|ins|inute|inutes)?$/.test(unit) ? 60 + : /^h/.test(unit) ? 3600 + : /^d/.test(unit) ? 86400 + : /^w/.test(unit) ? 604800 + : /^y/.test(unit) ? 31_536_000 + : NaN; + if (!Number.isFinite(multiplier)) { + throw new Error(`Invalid duration unit: ${unit}`); + } + return value * multiplier; +} diff --git a/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts b/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts index b1bf15b9fed..a43d0e4e444 100644 --- a/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts +++ b/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts @@ -29,41 +29,81 @@ function initializeRedisRealtimeStreams() { export const v1RealtimeStreams = singleton("realtimeStreams", initializeRedisRealtimeStreams); +/** + * Resolve which S2 basin a stream context belongs to. Precedence: + * + * 1. `run.streamBasinName` (set at trigger time, immutable per-run) + * 2. `session.streamBasinName` (set at session create time) + * 3. `REALTIME_STREAMS_S2_BASIN` (the legacy / OSS / pre-backfill global) + * + * Old runs / sessions that pre-date the per-org-basins migration carry + * `null` columns and fall through to the global basin, which is the + * one their streams were originally created in. Once the legacy basin + * drains via S2 retention (~30d on prod today), this fallback can be + * dropped — but it's cheap to keep as a safety net. 
+ * + * OSS / s2-lite installs always hit the global path because the + * provisioner is gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED` + * and `streamBasinName` is never written. + */ +export type StreamBasinContext = { + run?: { streamBasinName: string | null } | null; + session?: { streamBasinName: string | null } | null; +}; + +export function resolveStreamBasin(ctx: StreamBasinContext): string | undefined { + return ( + ctx.run?.streamBasinName ?? + ctx.session?.streamBasinName ?? + env.REALTIME_STREAMS_S2_BASIN ?? + undefined + ); +} + export function getRealtimeStreamInstance( environment: AuthenticatedEnvironment, - streamVersion: string + streamVersion: string, + basinContext?: StreamBasinContext ): StreamIngestor & StreamResponder { if (streamVersion === "v1") { return v1RealtimeStreams; - } else { - if ( - env.REALTIME_STREAMS_S2_BASIN && - (env.REALTIME_STREAMS_S2_ACCESS_TOKEN || - env.REALTIME_STREAMS_S2_SKIP_ACCESS_TOKENS === "true") - ) { - return new S2RealtimeStreams({ - basin: env.REALTIME_STREAMS_S2_BASIN, - accessToken: env.REALTIME_STREAMS_S2_ACCESS_TOKEN ?? "", - endpoint: env.REALTIME_STREAMS_S2_ENDPOINT, - skipAccessTokens: env.REALTIME_STREAMS_S2_SKIP_ACCESS_TOKENS === "true", - streamPrefix: [ - "org", - environment.organization.id, - "env", - environment.slug, - environment.id, - ].join("/"), - logLevel: env.REALTIME_STREAMS_S2_LOG_LEVEL, - flushIntervalMs: env.REALTIME_STREAMS_S2_FLUSH_INTERVAL_MS, - maxRetries: env.REALTIME_STREAMS_S2_MAX_RETRIES, - s2WaitSeconds: env.REALTIME_STREAMS_S2_WAIT_SECONDS, - accessTokenExpirationInMs: env.REALTIME_STREAMS_S2_ACCESS_TOKEN_EXPIRATION_IN_MS, - cache: s2RealtimeStreamsCache, - }); - } + } - throw new Error("Realtime streams v2 is required for this run but S2 configuration is missing"); + const resolvedBasin = resolveStreamBasin(basinContext ?? 
{}); + if ( + resolvedBasin && + (env.REALTIME_STREAMS_S2_ACCESS_TOKEN || env.REALTIME_STREAMS_S2_SKIP_ACCESS_TOKENS === "true") + ) { + return new S2RealtimeStreams({ + basin: resolvedBasin, + accessToken: env.REALTIME_STREAMS_S2_ACCESS_TOKEN ?? "", + endpoint: env.REALTIME_STREAMS_S2_ENDPOINT, + skipAccessTokens: env.REALTIME_STREAMS_S2_SKIP_ACCESS_TOKENS === "true", + streamPrefix: streamPrefixFor(environment, resolvedBasin), + logLevel: env.REALTIME_STREAMS_S2_LOG_LEVEL, + flushIntervalMs: env.REALTIME_STREAMS_S2_FLUSH_INTERVAL_MS, + maxRetries: env.REALTIME_STREAMS_S2_MAX_RETRIES, + s2WaitSeconds: env.REALTIME_STREAMS_S2_WAIT_SECONDS, + accessTokenExpirationInMs: env.REALTIME_STREAMS_S2_ACCESS_TOKEN_EXPIRATION_IN_MS, + cache: s2RealtimeStreamsCache, + }); } + + throw new Error("Realtime streams v2 is required for this run but S2 configuration is missing"); +} + +/** + * Build the in-basin stream prefix. When the basin is the legacy + * single-basin (OSS / pre-migration), include `org/{orgId}` so streams + * from different orgs are namespaced within the same basin. When the + * basin is per-org, drop the org segment — the basin already isolates. + */ +function streamPrefixFor(environment: AuthenticatedEnvironment, basin: string): string { + const isPerOrgBasin = basin !== env.REALTIME_STREAMS_S2_BASIN; + const segments = isPerOrgBasin + ? 
["env", environment.slug, environment.id] + : ["org", environment.organization.id, "env", environment.slug, environment.id]; + return segments.join("/"); } export function determineRealtimeStreamsVersion(streamVersion?: string): "v1" | "v2" { diff --git a/apps/webapp/app/v3/commonWorker.server.ts b/apps/webapp/app/v3/commonWorker.server.ts index a2fae9c73ce..6a87116c948 100644 --- a/apps/webapp/app/v3/commonWorker.server.ts +++ b/apps/webapp/app/v3/commonWorker.server.ts @@ -21,6 +21,13 @@ import { ResumeTaskDependencyService } from "./services/resumeTaskDependency.ser import { RetryAttemptService } from "./services/retryAttempt.server"; import { TimeoutDeploymentService } from "./services/timeoutDeployment.server"; import { BulkActionService } from "./services/bulk/BulkActionV2.server"; +import { + planTierFor, + provisionBasinForOrg, + reconfigureBasinForOrg, +} from "~/services/realtime/streamBasinProvisioner.server"; +import { getCurrentPlan } from "~/services/platform.v3.server"; +import { prisma } from "~/db.server"; function initializeWorker() { const redisOptions = { @@ -199,6 +206,24 @@ function initializeWorker() { maxAttempts: 5, }, }, + "v3.provisionStreamBasinForOrg": { + schema: z.object({ + orgId: z.string(), + }), + visibilityTimeoutMs: 60_000, + retry: { + maxAttempts: 5, + }, + }, + "v3.reconfigureStreamBasinForOrg": { + schema: z.object({ + orgId: z.string(), + }), + visibilityTimeoutMs: 60_000, + retry: { + maxAttempts: 5, + }, + }, }, concurrency: { workers: env.COMMON_WORKER_CONCURRENCY_WORKERS, @@ -282,6 +307,40 @@ function initializeWorker() { const service = new BulkActionService(); await service.process(payload.bulkActionId); }, + "v3.provisionStreamBasinForOrg": async ({ payload }) => { + // Backfill / retry path. Reads the org row, resolves the + // billing tier, hands it to the provisioner. The provisioner + // is itself a no-op when per-org basins are disabled or the + // basin is already provisioned. 
Throws on transient S2 + // failures so redis-worker retries naturally. + const org = await prisma.organization.findFirst({ + where: { id: payload.orgId }, + select: { + id: true, + slug: true, + streamBasinName: true, + }, + }); + if (!org) return; + + const plan = await getCurrentPlan(payload.orgId); + // `plan.code` carries the canonical plan id ("free", "v3_hobby_1", + // "v3_pro_1", "enterprise"). `plan.type` is just the + // billing-shape discriminator ("free" | "paid" | "enterprise") + // and would lump hobby + pro into one bucket. + const tier = planTierFor(plan?.v3Subscription?.plan?.code); + + await provisionBasinForOrg({ ...org, tier }); + }, + "v3.reconfigureStreamBasinForOrg": async ({ payload }) => { + const plan = await getCurrentPlan(payload.orgId); + // `plan.code` carries the canonical plan id ("free", "v3_hobby_1", + // "v3_pro_1", "enterprise"). `plan.type` is just the + // billing-shape discriminator ("free" | "paid" | "enterprise") + // and would lump hobby + pro into one bucket. 
+ const tier = planTierFor(plan?.v3Subscription?.plan?.code); + await reconfigureBasinForOrg(payload.orgId, tier); + }, }, }); diff --git a/internal-packages/database/prisma/migrations/20260504071227_add_stream_basin_name/migration.sql b/internal-packages/database/prisma/migrations/20260504071227_add_stream_basin_name/migration.sql new file mode 100644 index 00000000000..c346d499e76 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260504071227_add_stream_basin_name/migration.sql @@ -0,0 +1,8 @@ +-- AlterTable +ALTER TABLE "public"."Organization" ADD COLUMN IF NOT EXISTS "streamBasinName" TEXT; + +-- AlterTable +ALTER TABLE "public"."Session" ADD COLUMN IF NOT EXISTS "streamBasinName" TEXT; + +-- AlterTable +ALTER TABLE "public"."TaskRun" ADD COLUMN IF NOT EXISTS "streamBasinName" TEXT; diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index c7b5e7ce12b..baf420458b1 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -234,6 +234,13 @@ model Organization { platformNotifications PlatformNotification[] errorGroupStates ErrorGroupState[] + + /// S2 basin that holds this org's realtime streams. Null until the + /// per-org basin has been provisioned (OSS / s2-lite installs leave + /// it null forever; reads fall back to the global basin env var). + /// Set once at provisioning time; retention is reconfigured in-place + /// when the org's plan changes. + streamBasinName String? } model OrgMember { @@ -741,6 +748,12 @@ model Session { createdAt DateTime @default(now()) updatedAt DateTime @updatedAt + /// S2 basin where this session's stream pair lives. Stamped at create + /// time from `Organization.streamBasinName` so reads can resolve the + /// basin without joining org. Null when the org has no per-org basin + /// (OSS, or pre-backfill); reads fall back to the global basin. + streamBasinName String? 
+ runs SessionRun[] /// Idempotency: `(env, externalId)` uniquely identifies a session. @@ -975,6 +988,11 @@ model TaskRun { realtimeStreamsVersion String @default("v1") /// Store the stream keys that are being used by the run realtimeStreams String[] @default([]) + /// S2 basin where this run's realtime streams live. Stamped at create + /// time from `Organization.streamBasinName` so reads can resolve the + /// basin without joining org. Null when the org has no per-org basin + /// (OSS, or pre-backfill); reads fall back to the global basin. + streamBasinName String? @@unique([oneTimeUseToken]) @@unique([runtimeEnvironmentId, taskIdentifier, idempotencyKey]) diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 0da98c3c835..84bb9054654 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -498,6 +498,7 @@ export class RunEngine { bulkActionId, planType, realtimeStreamsVersion, + streamBasinName, debounce, annotations, onDebounced, @@ -660,6 +661,10 @@ export class RunEngine { bulkActionGroupIds: bulkActionId ? [bulkActionId] : undefined, planType, realtimeStreamsVersion, + // Stamp the org's S2 basin so realtime reads resolve + // without joining Organization. Null in OSS / pre-backfill; + // read precedence falls back to the global basin env var. + streamBasinName, debounce: debounce ? { key: debounce.key, diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index 15e63368d2e..251b3c0bd78 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -259,6 +259,11 @@ export type TriggerParams = { bulkActionId?: string; planType?: string; realtimeStreamsVersion?: string; + /// S2 basin where this run's realtime streams live. 
Stamped onto + /// the new TaskRun row so realtime read paths can resolve the basin + /// without joining `Organization`. Null in OSS / pre-backfill — + /// reads then fall back to the global basin env var. + streamBasinName?: string | null; debounce?: { key: string; delay: string; From a1d456464e53cdac63f4c4f101f6e4f2e1c898c8 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 14:07:25 +0100 Subject: [PATCH 02/13] chore(webapp): server-changes file for per-org basin migration --- .server-changes/per-org-stream-basins.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/per-org-stream-basins.md diff --git a/.server-changes/per-org-stream-basins.md b/.server-changes/per-org-stream-basins.md new file mode 100644 index 00000000000..a97a30d80f3 --- /dev/null +++ b/.server-changes/per-org-stream-basins.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Per-org S2 stream basins with plan-tied retention (free 7d / hobby 30d / pro 365d), gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED`. Stops basin retention from deleting streams out from under live chat sessions and unlocks per-org cost attribution via S2 basin metrics. From 692a2576019d99213fb81820c1f3d5adb752a007 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 14:25:58 +0100 Subject: [PATCH 03/13] fix(webapp): address coderabbit review on per-org basin migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reconfigure admin route: guard `JSON.parse` with try/catch + empty-body check so a malformed POST returns 400 instead of an unhandled 500 (mirror of the backfill route). - session-streams.wait race-check: select `streamBasinName` on the run and pass `{ run, session }` to `getRealtimeStreamInstance` so the resolver picks up the run's stamped basin when the session row is unavailable. 
- streamBasinProvisioner: 10s `AbortSignal.timeout()` on both `s2CreateBasin` and `s2ReconfigureBasin` so the synchronous org-create path can't hang signup forever on a slow/unresponsive S2. - commonWorker basin handlers: throw when `getCurrentPlan` returns undefined (billing API failure) so redis-worker retries instead of silently defaulting to "free" tier — a reconfigure landing during a transient billing outage would otherwise clip a pro org's retention from 365d to 7d. --- .../admin.api.v1.stream-basins.reconfigure.ts | 10 ++++++-- ...uns.$runFriendlyId.session-streams.wait.ts | 2 ++ .../realtime/streamBasinProvisioner.server.ts | 9 ++++++++ apps/webapp/app/v3/commonWorker.server.ts | 23 +++++++++++++++---- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index 4405213699a..1ec00454803 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -38,8 +38,14 @@ export async function action({ request }: ActionFunctionArgs) { ); } - const text = await request.text(); - const parsed = BodySchema.safeParse(JSON.parse(text)); + let parsed: ReturnType; + try { + const text = await request.text(); + const raw = text.length > 0 ? 
JSON.parse(text) : {}; + parsed = BodySchema.safeParse(raw); + } catch { + return json({ ok: false, error: "Invalid JSON body" }, { status: 400 }); + } if (!parsed.success) { return json({ ok: false, error: parsed.error.flatten() }, { status: 400 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index 0b8df65eb5d..8e2140ad04c 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -47,6 +47,7 @@ const { action, loader } = createActionApiRoute( id: true, friendlyId: true, realtimeStreamsVersion: true, + streamBasinName: true, }, }); @@ -129,6 +130,7 @@ const { action, loader } = createActionApiRoute( // Don't fall through to the run's own `realtimeStreamsVersion`, // which only describes the run's run-scoped streams. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { + run, session: maybeSession, }); diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index b115202995b..47f34ac10eb 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -219,6 +219,11 @@ async function s2CreateBasin(name: string, opts: CreateBasinOptions): Promise { const plan = await getCurrentPlan(payload.orgId); - // `plan.code` carries the canonical plan id ("free", "v3_hobby_1", - // "v3_pro_1", "enterprise"). `plan.type` is just the - // billing-shape discriminator ("free" | "paid" | "enterprise") - // and would lump hobby + pro into one bucket. + // Same guard as provision. A reconfigure that silently resolved + // to "free" would clip a pro org's retention from 365d to 7d + // and prematurely expire history — never acceptable. 
Throw and + // let the worker retry once billing recovers. + if (plan === undefined) { + throw new Error( + `[reconfigureStreamBasinForOrg] billing plan unavailable for org ${payload.orgId}; will retry` + ); + } const tier = planTierFor(plan?.v3Subscription?.plan?.code); await reconfigureBasinForOrg(payload.orgId, tier); }, From d6d3586f97095236cf847c5a2e0513bb49aacd8e Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 15:06:34 +0100 Subject: [PATCH 04/13] refactor(webapp): drop plan vocabulary from streamBasinProvisioner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The provisioner is now purely retention-string-driven: callers pass a duration like "30d" and it does the S2 round-trip. No tier types, no plan-name matching, no billing imports. The plan-aware mapping moves into a new `streamBasinRetentionByPlan.server.ts` shim that's the only file in the webapp that knows about plan codes. Callers that resolve retention from a plan (the worker's backfill / reconfigure handlers) import the shim; callers that just want a default (the org-create path) call the provisioner without `retention`. Also addresses two review concerns: - `basinNameForOrg` now throws when the configured prefix + env-name leave zero or negative budget for the org slug. Without the guard a too-long prefix would produce `slice(0, 0) = ""` for every org and silently collide their basins via S2's idempotent-create path. - The plan-code → retention mapping uses an exact-match switch instead of substring matching. Substring matching against future plan codes could grant the wrong retention (e.g. `"approved"` matching `"pro"`). The known set is small and explicit; new plan codes go in the switch at launch. Net surface change: - `streamBasinProvisioner.server.ts`: drops `StreamBasinTier`, `planTierFor`, `retentionFor` exports. Adds `defaultRetention()`. `provisionBasinForOrg` takes `{ retention?: string }` instead of `{ tier?: StreamBasinTier }`. 
`reconfigureBasinForOrg` takes a retention string instead of a tier. - `streamBasinRetentionByPlan.server.ts` (new): exports `resolveRetentionForOrg(orgId)` and `retentionForPlanCode(code)`. - `commonWorker.server.ts`: handlers call the shim, hand a string to the provisioner. - Admin reconfigure route: replaces the `tier` body field with a direct `retention` duration override. - Org create: no longer passes `tier: "free"`; provisioner uses the default. - New env var `REALTIME_STREAMS_BASIN_DEFAULT_RETENTION` (default `30d`). Existing per-plan vars are still consulted by the shim only. Verified end-to-end with chat.agent locally — fresh chat lands in the per-org basin, multi-turn behaves the same, no leakage to the global fallback. --- apps/webapp/app/env.server.ts | 12 +- apps/webapp/app/models/organization.server.ts | 8 +- .../admin.api.v1.stream-basins.reconfigure.ts | 35 +++--- .../realtime/streamBasinProvisioner.server.ts | 111 ++++++++---------- .../streamBasinRetentionByPlan.server.ts | 73 ++++++++++++ apps/webapp/app/v3/commonWorker.server.ts | 51 +++----- 6 files changed, 168 insertions(+), 122 deletions(-) create mode 100644 apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c2b280a7a57..f3840533b52 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1517,9 +1517,15 @@ const EnvironmentSchema = z /// — kept short to stay under S2's basin-name length limit. REALTIME_STREAMS_BASIN_NAME_PREFIX: z.string().default("triggerdotdev"), REALTIME_STREAMS_BASIN_NAME_ENV: z.string().default("dev"), - /// Plan-tier retention strings (S2 duration syntax: 7d / 30d / 1y). - /// Free / hobby / pro line up with billing tiers; enterprise uses - /// the pro default and is reconfigured per-contract via the API. + /// Default retention for new basins (S2 duration syntax: 7d / 30d / 1y). 
+ /// Used at org-create and as the fallback when no plan-specific + /// retention is resolved. Operators that don't run a billing API + /// only need this one. + REALTIME_STREAMS_BASIN_DEFAULT_RETENTION: z.string().default("30d"), + /// Plan-specific retention overrides — only consulted by the + /// optional `streamBasinRetentionByPlan` shim. Operators that + /// don't map plans to retention (OSS, self-hosted) can ignore + /// these and rely on the default above. REALTIME_STREAMS_BASIN_RETENTION_FREE: z.string().default("7d"), REALTIME_STREAMS_BASIN_RETENTION_HOBBY: z.string().default("30d"), REALTIME_STREAMS_BASIN_RETENTION_PRO: z.string().default("365d"), diff --git a/apps/webapp/app/models/organization.server.ts b/apps/webapp/app/models/organization.server.ts index b728626b36c..e9cc4b6ea18 100644 --- a/apps/webapp/app/models/organization.server.ts +++ b/apps/webapp/app/models/organization.server.ts @@ -85,16 +85,18 @@ export async function createOrganization( }); // Provision the org's S2 basin synchronously so the very first run - // gets `streamBasinName` stamped via the existing org read. Soft-fail - // on S2 errors so a transient outage doesn't block signup — the + // gets `streamBasinName` stamped via the existing org read. New orgs + // get the default retention; the plan-change path updates retention + // later if the operator runs a billing-aware install. Soft-fail on + // S2 errors so a transient outage doesn't block signup — the // backfill reconciler picks up any org left with `streamBasinName: null`. // No-op when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` (OSS mode). try { await provisionBasinForOrg({ id: organization.id, slug: organization.slug, - tier: "free", // new orgs always start on free retention streamBasinName: organization.streamBasinName, + // No `retention` — provisioner uses `defaultRetention()`. 
}); } catch (error) { logger.warn("[createOrganization] streamBasin provisioning failed; backfill will retry", { diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index 1ec00454803..5f274e95410 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -4,27 +4,26 @@ import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; import { isPerOrgBasinsEnabled, reconfigureBasinForOrg, - type StreamBasinTier, } from "~/services/realtime/streamBasinProvisioner.server"; import { commonWorker } from "~/v3/commonWorker.server"; /** * Admin trigger for `v3.reconfigureStreamBasinForOrg`. The plan-change - * path in `setPlan` already enqueues this automatically in cloud mode; + * path in `setPlan` enqueues this automatically when billing is wired; * this route exists for ops + e2e testing. * * - Default (`{ orgId }`): enqueues the worker job which resolves the - * tier via `getCurrentPlan` and PATCHes the basin to match. No-op - * locally because `getCurrentPlan` is gated to cloud hosts. - * - With `tier`: bypasses the billing lookup and runs reconfigure - * inline against the given tier. Useful for validating the PATCH - * wire shape end-to-end and as a manual override (e.g. enterprise - * contract retention). + * retention from the org's plan and PATCHes the basin to match. + * No-op when billing isn't configured (OSS). + * - With `retention`: bypasses the billing lookup and runs reconfigure + * inline against the given duration string (e.g. `"7d"`, `"30d"`, + * `"365d"`, `"1y"`). Useful for validating the PATCH wire shape + * end-to-end and as a manual override (e.g. enterprise contracts). 
*/ const BodySchema = z .object({ orgId: z.string(), - tier: z.enum(["free", "hobby", "pro"]).optional(), + retention: z.string().optional(), }) .strict(); @@ -50,13 +49,17 @@ export async function action({ request }: ActionFunctionArgs) { return json({ ok: false, error: parsed.error.flatten() }, { status: 400 }); } - if (parsed.data.tier) { - // Direct, synchronous reconfigure with the explicit tier override. - // Skips the worker queue + billing lookup so the PATCH is verifiable - // in the response. Errors surface as 500. - const tier: StreamBasinTier = parsed.data.tier; - await reconfigureBasinForOrg(parsed.data.orgId, tier); - return json({ ok: true, mode: "inline", orgId: parsed.data.orgId, tier }); + if (parsed.data.retention) { + // Direct, synchronous reconfigure with the explicit retention. + // Skips the worker queue + billing lookup so the PATCH is + // verifiable in the response. Errors surface as 500. + await reconfigureBasinForOrg(parsed.data.orgId, parsed.data.retention); + return json({ + ok: true, + mode: "inline", + orgId: parsed.data.orgId, + retention: parsed.data.retention, + }); } await commonWorker.enqueue({ diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 47f34ac10eb..228d1f01a3f 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -8,82 +8,69 @@ * basin in `REALTIME_STREAMS_S2_BASIN`. `Organization.streamBasinName` * stays null forever; reads / writes resolve to the global basin. * - * - **Per-org-basin mode** (cloud): + * - **Per-org-basin mode**: * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true`. Each org gets a - * dedicated basin with retention tied to its billing plan. The - * basin is the unit of cost attribution (S2 exposes per-basin - * metrics) and isolation (access tokens scope to one basin). 
+ * dedicated basin with its own retention. The basin is the unit of + * cost attribution (S2 exposes per-basin metrics) and isolation + * (access tokens scope to one basin). * - * Provisioning is one-shot per org: at creation time (or a one-off - * backfill for existing orgs) we create the basin and stamp + * This module is purely retention-string-driven: callers pass a + * duration like `"30d"` and the provisioner does the S2 round-trip. + * It has no concept of plans / tiers / billing — operators that want + * per-tier retention live one layer up (see + * `streamBasinRetentionByPlan.server.ts`). + * + * Provisioning is one-shot per org: at creation time (or via the + * backfill worker job for existing orgs) we create the basin and stamp * `Organization.streamBasinName`. New `TaskRun` / `Session` rows then * piggyback on the existing org read in `triggerTask` / session-create * paths and copy the value through. Reads use a precedence chain * (`run.streamBasinName ?? session.streamBasinName ?? globalBasin`). * - * Plan changes update retention in-place via `reconfigureBasin`. We do - * not move data across basins. + * Plan / retention changes update retention in-place via + * `reconfigureBasin`. We do not move data across basins. */ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; -/** - * Plan-tier shorthand for retention mapping. Callers translate the - * org's billing plan (via `getCurrentPlan`) into one of these and pass - * it to the provisioner. New orgs (no plan yet) and unbilled orgs - * default to `free` so we don't accidentally grant a year of retention - * to a freeloader. 
- */ -export type StreamBasinTier = "free" | "hobby" | "pro"; - -export function retentionFor(tier: StreamBasinTier): string { - switch (tier) { - case "pro": - return env.REALTIME_STREAMS_BASIN_RETENTION_PRO; - case "hobby": - return env.REALTIME_STREAMS_BASIN_RETENTION_HOBBY; - case "free": - default: - return env.REALTIME_STREAMS_BASIN_RETENTION_FREE; - } +export function isPerOrgBasinsEnabled(): boolean { + return env.REALTIME_STREAMS_PER_ORG_BASINS_ENABLED === "true"; } /** - * Permissive plan-name → tier mapping. Billing returns various strings - * over time (`free_connected`, `hobby`, `team_pro`, `enterprise`, etc.) - * — be forgiving but predictable. + * Default retention for new orgs and any caller that doesn't specify + * a value. Configurable via `REALTIME_STREAMS_BASIN_DEFAULT_RETENTION`. */ -export function planTierFor(planType: string | null | undefined): StreamBasinTier { - if (!planType) return "free"; - const normalized = planType.toLowerCase(); - if (normalized.includes("pro") || normalized.includes("team") || normalized.includes("enterprise")) { - return "pro"; - } - if (normalized.includes("hobby") || normalized.includes("starter")) { - return "hobby"; - } - return "free"; -} - -export function isPerOrgBasinsEnabled(): boolean { - return env.REALTIME_STREAMS_PER_ORG_BASINS_ENABLED === "true"; +export function defaultRetention(): string { + return env.REALTIME_STREAMS_BASIN_DEFAULT_RETENTION; } /** - * Build the basin name for an org. Format: `{prefix}-{env}-org-{slug}` - * (e.g. `triggerdotdev-prod-org-acme-corp`). The org slug is already - * lowercase-and-hyphenated by `createOrganization`, so it satisfies S2 - * basin-name rules without further normalization. We truncate - * defensively to keep total length under 63 chars (a common bucket - * convention; verify against S2 docs before raising). + * Build the basin name for an org. Format: `{prefix}-{env}-org-{slug}`. 
+ * The org slug is already lowercase-and-hyphenated by + * `createOrganization`, so it satisfies S2 basin-name rules without + * further normalization. We truncate defensively to keep total length + * under 63 chars (a common bucket convention; verify against S2 docs + * before raising). + * + * Throws if `REALTIME_STREAMS_BASIN_NAME_PREFIX` + + * `REALTIME_STREAMS_BASIN_NAME_ENV` are configured so long that no + * room remains for the slug — without this guard, `slice(0, 0)` would + * return an empty string and every org would share the same name, + * silently colliding via S2's 409-on-create. */ export function basinNameForOrg(org: { slug: string }): string { const prefix = env.REALTIME_STREAMS_BASIN_NAME_PREFIX; const envName = env.REALTIME_STREAMS_BASIN_NAME_ENV; const head = `${prefix}-${envName}-org-`; const budget = 63 - head.length; + if (budget <= 0) { + throw new Error( + `[streamBasinProvisioner] REALTIME_STREAMS_BASIN_NAME_PREFIX + REALTIME_STREAMS_BASIN_NAME_ENV too long: head="${head}" leaves no room for the org slug (budget=${budget}). Shorten the prefix or env-name values.` + ); + } const slug = org.slug.slice(0, budget); return `${head}${slug}`; } @@ -91,10 +78,10 @@ export function basinNameForOrg(org: { slug: string }): string { type ProvisionInput = { id: string; slug: string; - /// Caller decides the tier. Org-create path passes `"free"` for new - /// orgs; the backfill worker resolves the tier via `getCurrentPlan` - /// before calling. Defaults to `"free"` if omitted. - tier?: StreamBasinTier; + /// Duration string passed straight to S2. Defaults to + /// `defaultRetention()` when omitted. Caller decides; the provisioner + /// has no opinion about what retention is appropriate. + retention?: string; streamBasinName: string | null | undefined; }; @@ -109,9 +96,9 @@ type ProvisionResult = * success) and writes the column. * * Failure modes: - * - S2 unreachable / 5xx: throws. 
Callers in the org-create path - * should swallow + enqueue a retry job so signup never fails on a - * transient S2 outage. The backfill worker retries naturally. + * - S2 unreachable / 5xx / timeout: throws. Callers in the org-create + * path swallow + leave the column null so the backfill worker can + * retry, so signup never fails on a transient S2 outage. * - Auth misconfig (no token): throws. Should never happen in * per-org-basins mode but worth surfacing loudly. */ @@ -135,7 +122,7 @@ export async function provisionBasinForOrg( } const basin = basinNameForOrg(org); - const retention = retentionFor(org.tier ?? "free"); + const retention = org.retention ?? defaultRetention(); await s2CreateBasin(basin, { accessToken, @@ -159,13 +146,12 @@ export async function provisionBasinForOrg( } /** - * Update retention after a plan change. Idempotent. No-op when the - * org has no provisioned basin. Caller resolves the tier and passes - * it in — keeps the provisioner ignorant of billing. + * Update retention in-place. Idempotent. No-op when the org has no + * provisioned basin. */ export async function reconfigureBasinForOrg( orgId: string, - tier: StreamBasinTier + retention: string ): Promise { if (!isPerOrgBasinsEnabled()) return; @@ -178,7 +164,6 @@ export async function reconfigureBasinForOrg( }); if (!org?.streamBasinName) return; - const retention = retentionFor(tier); await s2ReconfigureBasin(org.streamBasinName, { accessToken, retentionPolicy: retention }); logger.info("[streamBasinProvisioner] reconfigured basin retention", { diff --git a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts new file mode 100644 index 00000000000..bba2fb7844d --- /dev/null +++ b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts @@ -0,0 +1,73 @@ +/** + * Cloud-flavored shim that resolves a stream-basin retention duration + * from an org's current billing plan. 
+ * + * Kept deliberately separate from `streamBasinProvisioner.server.ts` + * so the provisioner stays purely retention-string-driven and has no + * coupling to plan vocabulary. This file is the only place in the + * webapp that maps "plan code" → "retention duration". + * + * Operators that don't run a billing API just don't call this — the + * provisioner accepts retention strings directly, and the org-create + * path falls back to `defaultRetention()`. + */ +import { env } from "~/env.server"; +import { getCurrentPlan } from "~/services/platform.v3.server"; +import { defaultRetention } from "./streamBasinProvisioner.server"; + +/** + * Resolve the retention duration for an org based on its current plan. + * + * - Returns the configured retention for the plan when the billing + * API has data. + * - Returns `defaultRetention()` when no billing client is configured + * (OSS / non-cloud installs that flipped per-org basins on without + * wiring billing). + * - **Throws** when billing is configured but the call failed, so + * the redis-worker retry kicks in and we don't silently downgrade + * a paid org's retention. + */ +export async function resolveRetentionForOrg(orgId: string): Promise { + const plan = await getCurrentPlan(orgId); + + if (plan === undefined) { + // We can't tell from `getCurrentPlan` alone whether the billing + // client isn't configured (OSS) or whether the call failed + // (transient cloud outage). Today we conservatively throw so + // cloud installs retry. OSS installs that hit this path either: + // (a) flipped the per-org-basins flag on without wiring billing + // and should configure `BILLING_API_URL` / `BILLING_API_KEY`, + // or + // (b) shouldn't be calling this at all and should pass an + // explicit retention to the provisioner. 
+ throw new Error( + `[streamBasinRetentionByPlan] billing plan unavailable for org ${orgId}; will retry` + ); + } + + return retentionForPlanCode(plan.v3Subscription?.plan?.code); +} + +/** + * Map a plan code to a retention duration via env-var lookup. + * + * Exact-match against a small known set rather than substring matching, + * since substring matching against future plan codes could grant the + * wrong tier (e.g. `"approved"` would match `"pro"`). Add a new code + * here when launching a new plan. + */ +export function retentionForPlanCode(code: string | null | undefined): string { + if (!code) return defaultRetention(); + + switch (code) { + case "free": + return env.REALTIME_STREAMS_BASIN_RETENTION_FREE; + case "v3_hobby_1": + return env.REALTIME_STREAMS_BASIN_RETENTION_HOBBY; + case "v3_pro_1": + case "enterprise": + return env.REALTIME_STREAMS_BASIN_RETENTION_PRO; + default: + return defaultRetention(); + } +} diff --git a/apps/webapp/app/v3/commonWorker.server.ts b/apps/webapp/app/v3/commonWorker.server.ts index 3737f83213c..dfb0c6df448 100644 --- a/apps/webapp/app/v3/commonWorker.server.ts +++ b/apps/webapp/app/v3/commonWorker.server.ts @@ -22,11 +22,10 @@ import { RetryAttemptService } from "./services/retryAttempt.server"; import { TimeoutDeploymentService } from "./services/timeoutDeployment.server"; import { BulkActionService } from "./services/bulk/BulkActionV2.server"; import { - planTierFor, provisionBasinForOrg, reconfigureBasinForOrg, } from "~/services/realtime/streamBasinProvisioner.server"; -import { getCurrentPlan } from "~/services/platform.v3.server"; +import { resolveRetentionForOrg } from "~/services/realtime/streamBasinRetentionByPlan.server"; import { prisma } from "~/db.server"; function initializeWorker() { @@ -308,11 +307,12 @@ function initializeWorker() { await service.process(payload.bulkActionId); }, "v3.provisionStreamBasinForOrg": async ({ payload }) => { - // Backfill / retry path. 
Reads the org row, resolves the - // billing tier, hands it to the provisioner. The provisioner - // is itself a no-op when per-org basins are disabled or the - // basin is already provisioned. Throws on transient S2 - // failures so redis-worker retries naturally. + // Backfill / retry path. Resolves the retention for the org + // (cloud installs map plan→retention via the byPlan shim; + // others fall back to the default), then hands a plain + // retention string to the provisioner. The provisioner itself + // has no plan vocabulary. `resolveRetentionForOrg` throws on + // transient billing failure so redis-worker retries naturally. const org = await prisma.organization.findFirst({ where: { id: payload.orgId }, select: { @@ -323,38 +323,15 @@ function initializeWorker() { }); if (!org) return; - const plan = await getCurrentPlan(payload.orgId); - // `plan === undefined` means the billing API call itself failed - // (or the client isn't configured). Throw so redis-worker retries - // — silently defaulting to free would risk a paid org getting - // provisioned with 7d retention if the backfill happened to land - // during a transient billing outage. - if (plan === undefined) { - throw new Error( - `[provisionStreamBasinForOrg] billing plan unavailable for org ${payload.orgId}; will retry` - ); - } - // `plan.code` carries the canonical plan id ("free", "v3_hobby_1", - // "v3_pro_1", "enterprise"). `plan.type` is just the - // billing-shape discriminator ("free" | "paid" | "enterprise") - // and would lump hobby + pro into one bucket. - const tier = planTierFor(plan?.v3Subscription?.plan?.code); - - await provisionBasinForOrg({ ...org, tier }); + const retention = await resolveRetentionForOrg(payload.orgId); + await provisionBasinForOrg({ ...org, retention }); }, "v3.reconfigureStreamBasinForOrg": async ({ payload }) => { - const plan = await getCurrentPlan(payload.orgId); - // Same guard as provision. 
A reconfigure that silently resolved - // to "free" would clip a pro org's retention from 365d to 7d - // and prematurely expire history — never acceptable. Throw and - // let the worker retry once billing recovers. - if (plan === undefined) { - throw new Error( - `[reconfigureStreamBasinForOrg] billing plan unavailable for org ${payload.orgId}; will retry` - ); - } - const tier = planTierFor(plan?.v3Subscription?.plan?.code); - await reconfigureBasinForOrg(payload.orgId, tier); + // Same shape as provision: resolve retention up front, hand a + // plain string to the provisioner. The shim throws on billing + // failure rather than silently downgrading retention. + const retention = await resolveRetentionForOrg(payload.orgId); + await reconfigureBasinForOrg(payload.orgId, retention); }, }, }); From 054d1afcdb8260e0951d57007d8f4b91a78cb27c Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 15:53:25 +0100 Subject: [PATCH 05/13] fix(webapp): address review on per-org basin migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use `org.id` (cuid, fixed-length, unique-by-construction) as the basin-name suffix instead of a truncated `org.slug`. The slug approach could silently collide two orgs whose slugs share a prefix past the truncation point, since the create call treats S2's 409 as success — a real cross-tenant isolation risk. - `resolveRetentionForOrg` now distinguishes "billing not configured" from "billing call failed". OSS / self-hosted installs (no billing client) get `defaultRetention()` and the worker job converges; cloud installs that experience a transient billing failure throw and get retried by redis-worker. Previously every install without billing hit a permafail loop. - `reconfigureBasinForOrg` throws when no S2 access token is configured instead of silently returning, so a misconfigured cloud install surfaces as a worker failure rather than stale retention. 
- Duration env vars (`*_RETENTION*`, `*_DELETE_ON_EMPTY_MIN_AGE`) validated at boot via a `durationString()` Zod schema, so a misconfigured value fails fast at startup instead of at first basin operation. - Admin reconfigure route's `retention` body field validated against the same duration shape — bad input is now a clean 400 rather than a 500 from `parseDuration`. - Extract duration parsing into a shared `duration.server.ts` so the env validator and the provisioner share one source of truth. Verified end-to-end with chat.agent locally — fresh chat lands in the per-org basin, no leakage to the global fallback. --- apps/webapp/app/env.server.ts | 23 ++++-- apps/webapp/app/models/organization.server.ts | 1 - .../admin.api.v1.stream-basins.reconfigure.ts | 6 +- .../webapp/app/services/platform.v3.server.ts | 11 +++ .../app/services/realtime/duration.server.ts | 49 ++++++++++++ .../realtime/streamBasinProvisioner.server.ts | 79 +++++++------------ .../streamBasinRetentionByPlan.server.ts | 40 +++++----- apps/webapp/app/v3/commonWorker.server.ts | 1 - 8 files changed, 131 insertions(+), 79 deletions(-) create mode 100644 apps/webapp/app/services/realtime/duration.server.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index f3840533b52..4518295e14b 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -3,6 +3,19 @@ import { MachinePresetName } from "@trigger.dev/core/v3"; import { BoolEnv } from "./utils/boolEnv"; import { isValidDatabaseUrl } from "./utils/db"; import { isValidRegex } from "./utils/regex"; +import { isValidDuration } from "./services/realtime/duration.server"; + +/** + * `z.string()` constrained to a duration string parseable by + * `parseDuration` (e.g. `7d`, `30d`, `365d`, `1h`). Validated at boot + * so a typo'd retention env var fails fast at startup rather than + * lurking until the first basin operation. 
+ */ +function durationString() { + return z + .string() + .refine(isValidDuration, "must be a duration like 7d, 30d, 365d, 1h, 1y"); +} // Parses a CSV of machine preset names (e.g. "small-1x,small-2x") into a // non-empty array of MachinePresetName. Used by COMPUTE_TEMPLATE_MACHINE_PRESETS @@ -1521,19 +1534,19 @@ const EnvironmentSchema = z /// Used at org-create and as the fallback when no plan-specific /// retention is resolved. Operators that don't run a billing API /// only need this one. - REALTIME_STREAMS_BASIN_DEFAULT_RETENTION: z.string().default("30d"), + REALTIME_STREAMS_BASIN_DEFAULT_RETENTION: durationString().default("30d"), /// Plan-specific retention overrides — only consulted by the /// optional `streamBasinRetentionByPlan` shim. Operators that /// don't map plans to retention (OSS, self-hosted) can ignore /// these and rely on the default above. - REALTIME_STREAMS_BASIN_RETENTION_FREE: z.string().default("7d"), - REALTIME_STREAMS_BASIN_RETENTION_HOBBY: z.string().default("30d"), - REALTIME_STREAMS_BASIN_RETENTION_PRO: z.string().default("365d"), + REALTIME_STREAMS_BASIN_RETENTION_FREE: durationString().default("7d"), + REALTIME_STREAMS_BASIN_RETENTION_HOBBY: durationString().default("30d"), + REALTIME_STREAMS_BASIN_RETENTION_PRO: durationString().default("365d"), /// Storage class applied to per-org basins at create time. REALTIME_STREAMS_BASIN_STORAGE_CLASS: z.enum(["express", "standard"]).default("express"), /// `delete_on_empty_min_age` applied to per-org basins. Streams /// that go empty for this long are reaped automatically. 
- REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE: z.string().default("1h"), + REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE: durationString().default("1h"), REALTIME_STREAMS_DEFAULT_VERSION: z.enum(["v1", "v2"]).default("v1"), WAIT_UNTIL_TIMEOUT_MS: z.coerce.number().int().default(600_000), diff --git a/apps/webapp/app/models/organization.server.ts b/apps/webapp/app/models/organization.server.ts index e9cc4b6ea18..2a2a7f24554 100644 --- a/apps/webapp/app/models/organization.server.ts +++ b/apps/webapp/app/models/organization.server.ts @@ -94,7 +94,6 @@ export async function createOrganization( try { await provisionBasinForOrg({ id: organization.id, - slug: organization.slug, streamBasinName: organization.streamBasinName, // No `retention` — provisioner uses `defaultRetention()`. }); diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index 5f274e95410..71eec78f03a 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -1,6 +1,7 @@ import { json, type ActionFunctionArgs } from "@remix-run/server-runtime"; import { z } from "zod"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { isValidDuration } from "~/services/realtime/duration.server"; import { isPerOrgBasinsEnabled, reconfigureBasinForOrg, @@ -23,7 +24,10 @@ import { commonWorker } from "~/v3/commonWorker.server"; const BodySchema = z .object({ orgId: z.string(), - retention: z.string().optional(), + retention: z + .string() + .refine(isValidDuration, "retention must be a duration like 7d, 30d, 365d, 1h, 1y") + .optional(), }) .strict(); diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index 4a9162cc129..eb286a97535 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ 
b/apps/webapp/app/services/platform.v3.server.ts @@ -44,6 +44,17 @@ function initializeClient() { } const client = singleton("billingClient", initializeClient); + +/** + * `true` when the billing client was instantiated — i.e. we're running + * in a cloud-style install with `BILLING_API_URL` + `BILLING_API_KEY` + * configured. OSS / self-hosted installs return `false` here, which + * lets callers distinguish "no billing wired up, fall back to + * defaults" from "billing wired up but the call failed, retry." + */ +export function isBillingConfigured(): boolean { + return client !== undefined; +} // Failures from @trigger.dev/platform billing client calls are tracked via // this metric (with low-cardinality {function, kind} labels) rather than // logged. Every task invocation hits these paths, so per-call logs were too diff --git a/apps/webapp/app/services/realtime/duration.server.ts b/apps/webapp/app/services/realtime/duration.server.ts new file mode 100644 index 00000000000..c6aab9eb9df --- /dev/null +++ b/apps/webapp/app/services/realtime/duration.server.ts @@ -0,0 +1,49 @@ +/** + * Duration string parsing for stream-basin retention / delete-on-empty + * configuration. Used by `streamBasinProvisioner` (to convert to S2's + * integer-seconds wire format) and by `env.server.ts` (to validate + * duration-shaped env vars at boot rather than at first use). + * + * Accepts the short forms (`7d`, `30d`, `365d`, `1h`, `90m`, `45s`, + * `2w`, `1y`) and the human forms (`7days`, `1week`, `1year`). + */ + +const PATTERN = + /^(\d+)\s*(s|sec|secs|seconds?|m|min|mins|minutes?|h|hour|hours?|d|day|days?|w|week|weeks?|y|year|years?)$/; + +export function isValidDuration(input: string): boolean { + return PATTERN.test(input.trim().toLowerCase()); +} + +/** + * Parse a duration string into seconds. Throws on garbage so a + * misconfigured env var fails loudly. Use {@link isValidDuration} + * for non-throwing validation (e.g. inside a Zod `.refine()`). 
+ */ +export function parseDuration(input: string): number { + const trimmed = input.trim().toLowerCase(); + const match = trimmed.match(PATTERN); + if (!match) { + throw new Error(`Invalid duration string: ${input}`); + } + const value = parseInt(match[1]!, 10); + const unit = match[2]!; + const multiplier = + /^s/.test(unit) + ? 1 + : /^m(?:in|ins|inute|inutes)?$/.test(unit) + ? 60 + : /^h/.test(unit) + ? 3600 + : /^d/.test(unit) + ? 86400 + : /^w/.test(unit) + ? 604800 + : /^y/.test(unit) + ? 31_536_000 + : NaN; + if (!Number.isFinite(multiplier)) { + throw new Error(`Invalid duration unit: ${unit}`); + } + return value * multiplier; +} diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 228d1f01a3f..42eb7cdf1aa 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -34,6 +34,7 @@ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; +import { parseDuration } from "./duration.server"; export function isPerOrgBasinsEnabled(): boolean { return env.REALTIME_STREAMS_PER_ORG_BASINS_ENABLED === "true"; @@ -48,36 +49,28 @@ export function defaultRetention(): string { } /** - * Build the basin name for an org. Format: `{prefix}-{env}-org-{slug}`. - * The org slug is already lowercase-and-hyphenated by - * `createOrganization`, so it satisfies S2 basin-name rules without - * further normalization. We truncate defensively to keep total length - * under 63 chars (a common bucket convention; verify against S2 docs - * before raising). + * Build the basin name for an org. Format: `{prefix}-{env}-org-{id}`. 
* - * Throws if `REALTIME_STREAMS_BASIN_NAME_PREFIX` + - * `REALTIME_STREAMS_BASIN_NAME_ENV` are configured so long that no - * room remains for the slug — without this guard, `slice(0, 0)` would - * return an empty string and every org would share the same name, - * silently colliding via S2's 409-on-create. + * We use the org's `id` (cuid, fixed-length, unique-by-construction) + * rather than the slug. Slugs are user-influenced, can change, and — + * critically — could collide across orgs once truncated to fit the + * S2 basin-name length cap. cuid is short (25 chars) and never + * collides, so the basin name is stable and tenant-isolated by + * construction. + * + * Format check: `triggerdotdev-prod-org-{25 chars}` is 48 chars total, + * comfortably under the conventional 63-char cap. If you change the + * prefix / env-name to something extreme, this still fails fast at + * S2's validator. */ -export function basinNameForOrg(org: { slug: string }): string { +export function basinNameForOrg(org: { id: string }): string { const prefix = env.REALTIME_STREAMS_BASIN_NAME_PREFIX; const envName = env.REALTIME_STREAMS_BASIN_NAME_ENV; - const head = `${prefix}-${envName}-org-`; - const budget = 63 - head.length; - if (budget <= 0) { - throw new Error( - `[streamBasinProvisioner] REALTIME_STREAMS_BASIN_NAME_PREFIX + REALTIME_STREAMS_BASIN_NAME_ENV too long: head="${head}" leaves no room for the org slug (budget=${budget}). Shorten the prefix or env-name values.` - ); - } - const slug = org.slug.slice(0, budget); - return `${head}${slug}`; + return `${prefix}-${envName}-org-${org.id}`; } type ProvisionInput = { id: string; - slug: string; /// Duration string passed straight to S2. Defaults to /// `defaultRetention()` when omitted. Caller decides; the provisioner /// has no opinion about what retention is appropriate. 
@@ -156,7 +149,15 @@ export async function reconfigureBasinForOrg( if (!isPerOrgBasinsEnabled()) return; const accessToken = env.REALTIME_STREAMS_S2_ACCESS_TOKEN; - if (!accessToken) return; + if (!accessToken) { + // Per-org basins are enabled but no token is configured — that's a + // misconfiguration, not a no-op condition. Throw so the worker job + // surfaces in the queue's failure log instead of silently leaving + // retention stale on the basin. + throw new Error( + "REALTIME_STREAMS_S2_ACCESS_TOKEN must be set when REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true" + ); + } const org = await prisma.organization.findFirst({ where: { id: orgId }, @@ -197,8 +198,8 @@ async function s2CreateBasin(name: string, opts: CreateBasinOptions): Promise { - const plan = await getCurrentPlan(orgId); + if (!isBillingConfigured()) { + // No billing wired up — operator either runs OSS or hasn't set + // BILLING_API_URL / BILLING_API_KEY. Fall back to the default; + // the org-create path uses the same default, so this is just the + // backfill's catch-up path arriving at the same answer. + return defaultRetention(); + } + const plan = await getCurrentPlan(orgId); if (plan === undefined) { - // We can't tell from `getCurrentPlan` alone whether the billing - // client isn't configured (OSS) or whether the call failed - // (transient cloud outage). Today we conservatively throw so - // cloud installs retry. OSS installs that hit this path either: - // (a) flipped the per-org-basins flag on without wiring billing - // and should configure `BILLING_API_URL` / `BILLING_API_KEY`, - // or - // (b) shouldn't be calling this at all and should pass an - // explicit retention to the provisioner. + // Billing client exists but the call failed. Throw so redis-worker + // retries — silently defaulting to free would clip a paid org's + // retention if a backfill landed during a transient billing outage. 
throw new Error( `[streamBasinRetentionByPlan] billing plan unavailable for org ${orgId}; will retry` ); diff --git a/apps/webapp/app/v3/commonWorker.server.ts b/apps/webapp/app/v3/commonWorker.server.ts index dfb0c6df448..f98e3c834be 100644 --- a/apps/webapp/app/v3/commonWorker.server.ts +++ b/apps/webapp/app/v3/commonWorker.server.ts @@ -317,7 +317,6 @@ function initializeWorker() { where: { id: payload.orgId }, select: { id: true, - slug: true, streamBasinName: true, }, }); From fc8801707ef67e79ff27a34def48ee36fbc27de0 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 17:11:29 +0100 Subject: [PATCH 06/13] refactor(webapp): per-org basins for paid orgs only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Free orgs share the global stream basin (the existing legacy fallback path); paid orgs get a dedicated per-org basin with retention tied to their tier. Cleaner story, much smaller S2 footprint, and basin existence becomes a real tier benefit rather than a default for everyone. A single `v3.reconcileStreamBasinForOrg` worker job handles every plan transition idempotently: free → paid: provision a new basin, stamp `Organization.streamBasinName`. paid → paid: reconfigure retention (tier-change). S2 retention only takes effect on new streams, but that's fine — old streams age out on their original retention. paid → free: null `Organization.streamBasinName`. Future runs/sessions for this org route through the shared global basin via the existing read-precedence fallback. The per-org basin lingers; existing streams there respect their original retention until they age out. free → free: no-op. Replaces the previous `provisionStreamBasinForOrg` / `reconfigureStreamBasinForOrg` job pair so callers don't have to choose the right job for the transition. 
`setPlan` enqueues `reconcile` from all three plan-changed branches; the admin backfill route enqueues `reconcile` for every non-deleted org (idempotent — the worker decides per-org what to do). Org create no longer provisions synchronously — new orgs start free and use the shared basin until their first paid upgrade. Verified locally: backfill correctly deprovisioned 4 free orgs (column nulled, basins left intact) and kept the 1 hobby-tier org's basin. A fresh chat for a free org streams into the shared basin under the legacy prefix `org/{orgId}/env/.../sessions/{chatId}/{io}` with no new streams in the old per-org basin. --- apps/webapp/app/models/organization.server.ts | 22 --- .../admin.api.v1.stream-basins.backfill.ts | 63 +++----- .../admin.api.v1.stream-basins.reconfigure.ts | 4 +- .../webapp/app/services/platform.v3.server.ts | 39 +++-- .../streamBasinRetentionByPlan.server.ts | 153 +++++++++++++----- apps/webapp/app/v3/commonWorker.server.ts | 50 ++---- 6 files changed, 177 insertions(+), 154 deletions(-) diff --git a/apps/webapp/app/models/organization.server.ts b/apps/webapp/app/models/organization.server.ts index 2a2a7f24554..14315dd337c 100644 --- a/apps/webapp/app/models/organization.server.ts +++ b/apps/webapp/app/models/organization.server.ts @@ -14,8 +14,6 @@ import { env } from "~/env.server"; import { featuresForUrl } from "~/features.server"; import { createApiKeyForEnv, createPkApiKeyForEnv, envSlug } from "./api-key.server"; import { getDefaultEnvironmentConcurrencyLimit } from "~/services/platform.v3.server"; -import { logger } from "~/services/logger.server"; -import { provisionBasinForOrg } from "~/services/realtime/streamBasinProvisioner.server"; export type { Organization }; const nanoid = customAlphabet("1234567890abcdef", 4); @@ -84,26 +82,6 @@ export async function createOrganization( }, }); - // Provision the org's S2 basin synchronously so the very first run - // gets `streamBasinName` stamped via the existing org read. 
New orgs - // get the default retention; the plan-change path updates retention - // later if the operator runs a billing-aware install. Soft-fail on - // S2 errors so a transient outage doesn't block signup — the - // backfill reconciler picks up any org left with `streamBasinName: null`. - // No-op when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` (OSS mode). - try { - await provisionBasinForOrg({ - id: organization.id, - streamBasinName: organization.streamBasinName, - // No `retention` — provisioner uses `defaultRetention()`. - }); - } catch (error) { - logger.warn("[createOrganization] streamBasin provisioning failed; backfill will retry", { - orgId: organization.id, - error: error instanceof Error ? error.message : String(error), - }); - } - return { ...organization }; } diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts index f35fa842ee0..991548f8332 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts @@ -7,26 +7,22 @@ import { commonWorker } from "~/v3/commonWorker.server"; import { logger } from "~/services/logger.server"; /** - * One-shot backfill that enqueues `v3.provisionStreamBasinForOrg` for - * every org with `streamBasinName: null`. Idempotent — re-running picks - * up only the orgs that haven't been provisioned yet, and the worker - * job itself is also idempotent (the provisioner short-circuits if the - * org column is already set). + * One-shot backfill that enqueues `v3.reconcileStreamBasinForOrg` for + * every non-deleted org. The reconciler decides per-org what to do: + * provision a basin for paid orgs that don't have one, reconfigure + * retention for paid orgs whose tier changed, deprovision (null the + * column) for free orgs that were mistakenly provisioned. Idempotent + * — re-running converges to the desired state. 
* * - Admin auth via `requireAdminApiRequest` (PAT in `Authorization`). * - Refuses to run when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` * so OSS / s2-lite installs can't accidentally trigger basin - * creation against a misconfigured backend. + * operations against a misconfigured backend. * - `dryRun=true` (default false) returns the count without enqueueing. * - `limit` (default 1000, max 10000) caps a single invocation. Run - * again to process more — the column filter naturally walks the - * queue forward each call. - * - Each job is keyed `provisionStreamBasin:` so concurrent - * backfill calls converge to one job per org instead of duplicating. - * - * Run from a shell: - * curl -X POST -H "Authorization: Bearer $PAT" \ - * "https://api.trigger.dev/admin/api/v1/stream-basins/backfill?limit=200&dryRun=true" + * again with the next batch. + * - Each job is keyed `reconcileStreamBasin:` so concurrent + * calls converge to one job per org. */ const BodySchema = z @@ -59,8 +55,6 @@ export async function action({ request }: ActionFunctionArgs) { ); } - // `application/json` POST body — empty body falls back to defaults so - // a parameterless POST does the right thing for the default backfill. let parsed: z.infer; try { const text = await request.text(); @@ -76,21 +70,19 @@ export async function action({ request }: ActionFunctionArgs) { const { dryRun, limit } = parsed; - // Page candidate orgs. Ordered by createdAt so re-runs walk the queue - // forward predictably; deletedAt filter avoids resurrecting orgs. + // Walk every non-deleted org. The reconcile worker is fast for the + // no-op case (free with null column) so enqueueing for all is fine + // — saves us from doing per-org billing lookups here just to filter + // candidates. 
const candidates = await prisma.organization.findMany({ - where: { - streamBasinName: null, - deletedAt: null, - }, + where: { deletedAt: null }, orderBy: { createdAt: "asc" }, take: limit, select: { id: true }, }); - // Total count of remaining nulls (for progress reporting). - const remainingTotal = await prisma.organization.count({ - where: { streamBasinName: null, deletedAt: null }, + const totalOrgs = await prisma.organization.count({ + where: { deletedAt: null }, }); if (dryRun) { @@ -99,22 +91,19 @@ export async function action({ request }: ActionFunctionArgs) { dryRun: true, enqueued: 0, pending: candidates.length, - remaining: Math.max(0, remainingTotal - candidates.length), + remaining: Math.max(0, totalOrgs - candidates.length), orgIds: candidates.map((o) => o.id), }; return json(response); } - // Enqueue one job per org. Per-org dedupe key collapses concurrent - // backfill calls into a single pending job, and a job that's already - // run (basin set) is a no-op on the worker side. 
let enqueued = 0; for (const org of candidates) { try { await commonWorker.enqueue({ - job: "v3.provisionStreamBasinForOrg", + job: "v3.reconcileStreamBasinForOrg", payload: { orgId: org.id }, - id: `provisionStreamBasin:${org.id}`, + id: `reconcileStreamBasin:${org.id}`, }); enqueued += 1; } catch (error) { @@ -130,11 +119,11 @@ export async function action({ request }: ActionFunctionArgs) { dryRun: false, enqueued, pending: candidates.length, - remaining: Math.max(0, remainingTotal - enqueued), + remaining: Math.max(0, totalOrgs - enqueued), orgIds: candidates.map((o) => o.id), }; - logger.info("[stream-basins-backfill] enqueued provisioning jobs", { + logger.info("[stream-basins-backfill] enqueued reconcile jobs", { enqueued, candidates: candidates.length, remaining: response.remaining, @@ -149,17 +138,15 @@ export async function loader({ request }: ActionFunctionArgs) { await requireAdminApiRequest(request); const totalOrgs = await prisma.organization.count({ where: { deletedAt: null } }); - const provisioned = await prisma.organization.count({ + const withBasin = await prisma.organization.count({ where: { deletedAt: null, NOT: { streamBasinName: null } }, }); - const remaining = totalOrgs - provisioned; return json({ ok: true, perOrgBasinsEnabled: isPerOrgBasinsEnabled(), totalOrgs, - provisioned, - remaining, - completion: totalOrgs === 0 ? 
1 : provisioned / totalOrgs, + withBasin, + withoutBasin: totalOrgs - withBasin, }); } diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index 71eec78f03a..866045bbe47 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -67,9 +67,9 @@ export async function action({ request }: ActionFunctionArgs) { } await commonWorker.enqueue({ - job: "v3.reconfigureStreamBasinForOrg", + job: "v3.reconcileStreamBasinForOrg", payload: { orgId: parsed.data.orgId }, - id: `reconfigureStreamBasin:${parsed.data.orgId}`, + id: `reconcileStreamBasin:${parsed.data.orgId}`, }); return json({ ok: true, mode: "queued", enqueued: parsed.data.orgId }); diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index eb286a97535..1e312c5e7ae 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ b/apps/webapp/app/services/platform.v3.server.ts @@ -403,7 +403,7 @@ export async function setPlan( // Invalidate billing cache since plan changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconfigure(organization.id); + await enqueueStreamBasinReconcile(organization.id); return redirect(newProjectPath(organization, "You're on the Free plan.")); } else { return redirectWithErrorMessage( @@ -421,40 +421,49 @@ export async function setPlan( // Invalidate billing cache since subscription changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconfigure(organization.id); + await enqueueStreamBasinReconcile(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription updated successfully."); } case "canceled_subscription": { // Invalidate 
billing cache since subscription was canceled opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconfigure(organization.id); + await enqueueStreamBasinReconcile(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription canceled."); } } } /** - * Best-effort enqueue: when an org's plan changes we want the per-org - * S2 basin's retention to follow (free=7d, hobby=30d, pro=365d). The - * worker job is idempotent and a no-op when per-org basins are disabled - * or the org has no basin yet (OSS / pre-backfill). Failures are - * logged but never block the plan change itself — billing has already - * accepted by the time we reach this code. + * Best-effort enqueue: when an org's plan changes we reconcile its + * stream-basin state. The reconciler handles every transition: + * + * free → paid: provision a dedicated basin with the plan's retention. + * paid → paid: reconfigure the existing basin's retention. + * paid → free: null `Organization.streamBasinName`. Future runs/sessions + * flow to the shared global basin; the per-org basin + * lingers until existing streams age out on their original + * retention. + * free → free: no-op. + * + * Idempotent and a no-op when per-org basins are disabled or billing + * isn't configured. Failures are logged but never block the plan + * change itself — billing has already accepted by the time we reach + * this code. */ -async function enqueueStreamBasinReconfigure(orgId: string) { +async function enqueueStreamBasinReconcile(orgId: string) { try { const { commonWorker } = await import("~/v3/commonWorker.server"); await commonWorker.enqueue({ - job: "v3.reconfigureStreamBasinForOrg", + job: "v3.reconcileStreamBasinForOrg", payload: { orgId }, // Per-org dedupe key — concurrent plan changes collapse to one - // pending reconfigure job. The job re-reads the current plan - // when it executes, so the latest tier wins. 
- id: `reconfigureStreamBasin:${orgId}`, + // pending reconcile job. The job re-reads the current plan when + // it executes, so the latest tier wins. + id: `reconcileStreamBasin:${orgId}`, }); } catch (error) { - logger.warn("[setPlan] failed to enqueue stream basin reconfigure", { + logger.warn("[setPlan] failed to enqueue stream basin reconcile", { orgId, error: error instanceof Error ? error.message : String(error), }); diff --git a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts index ab758b87041..42b934b0574 100644 --- a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts @@ -1,53 +1,38 @@ /** - * Cloud-flavored shim that resolves a stream-basin retention duration - * from an org's current billing plan. + * Cloud-flavored shim that maps an org's billing plan to its + * stream-basin state — both whether it should have a dedicated basin + * at all, and what retention to apply if so. * * Kept deliberately separate from `streamBasinProvisioner.server.ts` * so the provisioner stays purely retention-string-driven and has no * coupling to plan vocabulary. This file is the only place in the - * webapp that maps "plan code" → "retention duration". + * webapp that maps "plan code" → "basin policy". * - * Operators that don't run a billing API just don't call this — the - * provisioner accepts retention strings directly, and the org-create - * path falls back to `defaultRetention()`. + * Operators that don't run a billing API never call this — orgs stay + * on the global shared basin via the existing read-precedence + * fallback. 
*/ +import { prisma } from "~/db.server"; import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; import { getCurrentPlan, isBillingConfigured } from "~/services/platform.v3.server"; -import { defaultRetention } from "./streamBasinProvisioner.server"; +import { + defaultRetention, + provisionBasinForOrg, + reconfigureBasinForOrg, +} from "./streamBasinProvisioner.server"; /** - * Resolve the retention duration for an org based on its current plan. + * Plan codes that get a dedicated per-org basin. Free orgs (and + * unbilled / unknown plan codes) fall through to the shared global + * basin via the existing read-precedence fallback. * - * - When billing is **not configured** (OSS / self-hosted installs), - * returns `defaultRetention()` — the worker job converges, the - * backfill completes, and operators get a sane default without - * having to wire up a billing API. - * - When billing **is configured** and the call succeeds, maps the - * plan code to a retention duration. - * - When billing **is configured** but the call failed (transient - * outage / 5xx), **throws** so the redis-worker retry kicks in - * and we don't silently downgrade a paid org's retention. + * Adding a plan: drop its code in here AND in `retentionForPlanCode`. */ -export async function resolveRetentionForOrg(orgId: string): Promise { - if (!isBillingConfigured()) { - // No billing wired up — operator either runs OSS or hasn't set - // BILLING_API_URL / BILLING_API_KEY. Fall back to the default; - // the org-create path uses the same default, so this is just the - // backfill's catch-up path arriving at the same answer. - return defaultRetention(); - } +const PAID_PLAN_CODES = new Set(["v3_hobby_1", "v3_pro_1", "enterprise"]); - const plan = await getCurrentPlan(orgId); - if (plan === undefined) { - // Billing client exists but the call failed. 
Throw so redis-worker - // retries — silently defaulting to free would clip a paid org's - // retention if a backfill landed during a transient billing outage. - throw new Error( - `[streamBasinRetentionByPlan] billing plan unavailable for org ${orgId}; will retry` - ); - } - - return retentionForPlanCode(plan.v3Subscription?.plan?.code); +export function isPaidPlanCode(code: string | null | undefined): boolean { + return code != null && PAID_PLAN_CODES.has(code); } /** @@ -73,3 +58,99 @@ export function retentionForPlanCode(code: string | null | undefined): string { return defaultRetention(); } } + +type ReconcileResult = + | { kind: "skipped"; reason: "billing-not-configured" | "org-not-found" | "free-no-basin" } + | { kind: "provisioned"; retention: string } + | { kind: "reconfigured"; retention: string } + | { kind: "deprovisioned" }; + +/** + * Reconcile an org's basin state with its current plan. Idempotent; + * call whenever the plan changes or in a backfill loop. + * + * Transitions: + * + * plan paid + no basin → provision a new basin, stamp column. + * plan paid + has basin → reconfigure retention (tier may have + * changed). S2 retention only applies to + * *new* streams, but that's fine — old + * ones live out their original retention. + * plan free + has basin → null the column. New runs/sessions for + * this org route through the shared global + * basin. The per-org basin lingers until + * its existing streams expire on their + * original retention; no S2-side cleanup + * happens here. + * plan free + no basin → no-op. + * + * OSS / non-billing installs always hit the no-op path because + * `isBillingConfigured()` is false. Free-by-default. + * + * Throws on transient billing failure so redis-worker retries — + * silently defaulting to "free" during an outage would deprovision a + * paid org's basin and lose isolation. 
+ */ +export async function reconcileBasinForOrg(orgId: string): Promise { + if (!isBillingConfigured()) { + return { kind: "skipped", reason: "billing-not-configured" }; + } + + const plan = await getCurrentPlan(orgId); + if (plan === undefined) { + throw new Error( + `[streamBasinReconciler] billing plan unavailable for org ${orgId}; will retry` + ); + } + + const planCode = plan.v3Subscription?.plan?.code; + const paid = isPaidPlanCode(planCode); + + const org = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { id: true, streamBasinName: true }, + }); + if (!org) { + return { kind: "skipped", reason: "org-not-found" }; + } + + if (paid && !org.streamBasinName) { + const retention = retentionForPlanCode(planCode); + await provisionBasinForOrg({ id: org.id, streamBasinName: null, retention }); + logger.info("[streamBasinReconciler] provisioned (paid upgrade)", { + orgId, + planCode, + retention, + }); + return { kind: "provisioned", retention }; + } + + if (paid && org.streamBasinName) { + const retention = retentionForPlanCode(planCode); + await reconfigureBasinForOrg(org.id, retention); + logger.info("[streamBasinReconciler] reconfigured (paid tier change)", { + orgId, + planCode, + retention, + }); + return { kind: "reconfigured", retention }; + } + + if (!paid && org.streamBasinName) { + // Downgrade. Don't touch S2 — basin lingers, old streams keep their + // original retention until they age out. Just unstamp the org so + // future runs/sessions flow to the shared global basin. 
+ await prisma.organization.update({ + where: { id: org.id }, + data: { streamBasinName: null }, + }); + logger.info("[streamBasinReconciler] deprovisioned (downgrade to free)", { + orgId, + planCode, + previousBasin: org.streamBasinName, + }); + return { kind: "deprovisioned" }; + } + + return { kind: "skipped", reason: "free-no-basin" }; +} diff --git a/apps/webapp/app/v3/commonWorker.server.ts b/apps/webapp/app/v3/commonWorker.server.ts index f98e3c834be..a4366c042d7 100644 --- a/apps/webapp/app/v3/commonWorker.server.ts +++ b/apps/webapp/app/v3/commonWorker.server.ts @@ -21,12 +21,7 @@ import { ResumeTaskDependencyService } from "./services/resumeTaskDependency.ser import { RetryAttemptService } from "./services/retryAttempt.server"; import { TimeoutDeploymentService } from "./services/timeoutDeployment.server"; import { BulkActionService } from "./services/bulk/BulkActionV2.server"; -import { - provisionBasinForOrg, - reconfigureBasinForOrg, -} from "~/services/realtime/streamBasinProvisioner.server"; -import { resolveRetentionForOrg } from "~/services/realtime/streamBasinRetentionByPlan.server"; -import { prisma } from "~/db.server"; +import { reconcileBasinForOrg } from "~/services/realtime/streamBasinRetentionByPlan.server"; function initializeWorker() { const redisOptions = { @@ -205,16 +200,7 @@ function initializeWorker() { maxAttempts: 5, }, }, - "v3.provisionStreamBasinForOrg": { - schema: z.object({ - orgId: z.string(), - }), - visibilityTimeoutMs: 60_000, - retry: { - maxAttempts: 5, - }, - }, - "v3.reconfigureStreamBasinForOrg": { + "v3.reconcileStreamBasinForOrg": { schema: z.object({ orgId: z.string(), }), @@ -306,31 +292,13 @@ function initializeWorker() { const service = new BulkActionService(); await service.process(payload.bulkActionId); }, - "v3.provisionStreamBasinForOrg": async ({ payload }) => { - // Backfill / retry path. 
Resolves the retention for the org - // (cloud installs map plan→retention via the byPlan shim; - // others fall back to the default), then hands a plain - // retention string to the provisioner. The provisioner itself - // has no plan vocabulary. `resolveRetentionForOrg` throws on - // transient billing failure so redis-worker retries naturally. - const org = await prisma.organization.findFirst({ - where: { id: payload.orgId }, - select: { - id: true, - streamBasinName: true, - }, - }); - if (!org) return; - - const retention = await resolveRetentionForOrg(payload.orgId); - await provisionBasinForOrg({ ...org, retention }); - }, - "v3.reconfigureStreamBasinForOrg": async ({ payload }) => { - // Same shape as provision: resolve retention up front, hand a - // plain string to the provisioner. The shim throws on billing - // failure rather than silently downgrading retention. - const retention = await resolveRetentionForOrg(payload.orgId); - await reconfigureBasinForOrg(payload.orgId, retention); + "v3.reconcileStreamBasinForOrg": async ({ payload }) => { + // Bring the org's basin state in line with its current plan: + // provision on free→paid, reconfigure retention on tier change, + // null the column on paid→free (the basin itself lingers; old + // streams age out naturally). Idempotent — safe to enqueue + // from setPlan branches and the backfill loop. + await reconcileBasinForOrg(payload.orgId); }, }, }); From 97eb08ec5eebb435f91ac1a38db6f93096fee68b Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 18:32:52 +0100 Subject: [PATCH 07/13] fix(webapp): address review on per-org basin migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness fixes flagged in PR review. - Session-stream race-check resolves basin from `{ session }` only. 
The append-side writer in `realtime.v1.sessions.$session.$io.append.ts` passes only `{ session }`, and `resolveStreamBasin` prefers `run` over `session` when both are present. During the migration window `run.streamBasinName` and `session.streamBasinName` can differ — writes land in the session's basin, so the race-check has to read from the same one or it falls through to the redis path silently. - Backfill admin route now supports cursor pagination via `afterOrgId` + `nextAfterOrgId`, so deployments with more orgs than `limit` (max 10k per call) can actually page through. `remaining` now counts orgs strictly past the cursor returned, matching the dry-run semantics. --- .../admin.api.v1.stream-basins.backfill.ts | 39 ++++++++++++++----- ...uns.$runFriendlyId.session-streams.wait.ts | 10 ++++- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts index 991548f8332..90f794a7a1c 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts @@ -19,8 +19,9 @@ import { logger } from "~/services/logger.server"; * so OSS / s2-lite installs can't accidentally trigger basin * operations against a misconfigured backend. * - `dryRun=true` (default false) returns the count without enqueueing. - * - `limit` (default 1000, max 10000) caps a single invocation. Run - * again with the next batch. + * - `limit` (default 1000, max 10000) caps a single invocation. To + * page through more orgs than `limit`, pass `afterOrgId` from the + * previous response's `nextAfterOrgId`. * - Each job is keyed `reconcileStreamBasin:` so concurrent * calls converge to one job per org. 
*/ @@ -29,6 +30,7 @@ const BodySchema = z .object({ dryRun: z.boolean().optional().default(false), limit: z.number().int().min(1).max(10_000).optional().default(1000), + afterOrgId: z.string().optional(), }) .strict(); @@ -39,6 +41,7 @@ type BackfillResponse = { pending: number; remaining: number; orgIds: string[]; + nextAfterOrgId: string | null; }; export async function action({ request }: ActionFunctionArgs) { @@ -68,22 +71,32 @@ export async function action({ request }: ActionFunctionArgs) { return json({ ok: false, error: "Invalid JSON body" }, { status: 400 }); } - const { dryRun, limit } = parsed; + const { dryRun, limit, afterOrgId } = parsed; // Walk every non-deleted org. The reconcile worker is fast for the // no-op case (free with null column) so enqueueing for all is fine // — saves us from doing per-org billing lookups here just to filter - // candidates. + // candidates. Cursor on `id` (cuid is sortable) gives stable paging + // across calls; `createdAt` ties get broken by the cursor. const candidates = await prisma.organization.findMany({ where: { deletedAt: null }, - orderBy: { createdAt: "asc" }, + orderBy: { id: "asc" }, take: limit, + ...(afterOrgId ? { cursor: { id: afterOrgId }, skip: 1 } : {}), select: { id: true }, }); - const totalOrgs = await prisma.organization.count({ - where: { deletedAt: null }, - }); + const lastReturnedId = candidates[candidates.length - 1]?.id; + const nextAfterOrgId = candidates.length === limit && lastReturnedId ? lastReturnedId : null; + + // Orgs still beyond the cursor we just returned. On the final page, + // `lastReturnedId` is undefined (empty result) or the response is short + // of `limit`, so this is 0 — exactly what the caller needs to stop. + const remaining = lastReturnedId + ? 
await prisma.organization.count({ + where: { deletedAt: null, id: { gt: lastReturnedId } }, + }) + : 0; if (dryRun) { const response: BackfillResponse = { @@ -91,8 +104,9 @@ export async function action({ request }: ActionFunctionArgs) { dryRun: true, enqueued: 0, pending: candidates.length, - remaining: Math.max(0, totalOrgs - candidates.length), + remaining, orgIds: candidates.map((o) => o.id), + nextAfterOrgId, }; return json(response); } @@ -114,13 +128,18 @@ export async function action({ request }: ActionFunctionArgs) { } } + // `remaining` counts orgs strictly past the cursor returned to the + // caller. Enqueue failures don't change this — re-running with the + // same `afterOrgId` would page through the same window and the + // per-org idempotency key keeps it safe. const response: BackfillResponse = { ok: true, dryRun: false, enqueued, pending: candidates.length, - remaining: Math.max(0, totalOrgs - enqueued), + remaining, orgIds: candidates.map((o) => o.id), + nextAfterOrgId, }; logger.info("[stream-basins-backfill] enqueued reconcile jobs", { diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index 8e2140ad04c..90338726799 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -47,7 +47,6 @@ const { action, loader } = createActionApiRoute( id: true, friendlyId: true, realtimeStreamsVersion: true, - streamBasinName: true, }, }); @@ -129,8 +128,15 @@ const { action, loader } = createActionApiRoute( // hardcode "v2", so the race-check reader has to match. // Don't fall through to the run's own `realtimeStreamsVersion`, // which only describes the run's run-scoped streams. + // + // Resolve basin from `session` only (not `run`). 
The append-side + // writer in `realtime.v1.sessions.$session.$io.append.ts` passes + // only `{ session }`, and `resolveStreamBasin` prefers `run` over + // `session` when both are present. During the per-org-basin + // migration window, `run.streamBasinName` and + // `session.streamBasinName` can differ — the writes land in the + // session's basin, so the race-check has to read from the same. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { - run, session: maybeSession, }); From 871b9936591c7a3e87ed6ceb3c7a81d84cfd4fa1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 19:43:21 +0100 Subject: [PATCH 08/13] fix(webapp): early-return reconcile when per-org basins disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this guard `reconcileBasinForOrg` would still call into `provisionBasinForOrg` / `reconfigureBasinForOrg`, which both no-op behind the feature flag, but the reconciler then logged "provisioned (paid upgrade)" and returned `{ kind: "provisioned" }`. Misleading on a cloud install where billing is wired but per-org basins are off — the logs claim work that didn't happen, and we paid for a billing API round-trip we couldn't act on. Bail at the top with `{ kind: "skipped", reason: "feature-disabled" }` so the result and the logs match the actual no-op behaviour. 
--- .../streamBasinRetentionByPlan.server.ts | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts index 42b934b0574..d35471d11e0 100644 --- a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts @@ -18,6 +18,7 @@ import { logger } from "~/services/logger.server"; import { getCurrentPlan, isBillingConfigured } from "~/services/platform.v3.server"; import { defaultRetention, + isPerOrgBasinsEnabled, provisionBasinForOrg, reconfigureBasinForOrg, } from "./streamBasinProvisioner.server"; @@ -60,7 +61,14 @@ export function retentionForPlanCode(code: string | null | undefined): string { } type ReconcileResult = - | { kind: "skipped"; reason: "billing-not-configured" | "org-not-found" | "free-no-basin" } + | { + kind: "skipped"; + reason: + | "billing-not-configured" + | "feature-disabled" + | "org-not-found" + | "free-no-basin"; + } | { kind: "provisioned"; retention: string } | { kind: "reconfigured"; retention: string } | { kind: "deprovisioned" }; @@ -96,6 +104,15 @@ export async function reconcileBasinForOrg(orgId: string): Promise Date: Mon, 4 May 2026 21:00:59 +0100 Subject: [PATCH 09/13] fix(webapp): row-optional session-channel routes default to org basin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PUT /realtime/v1/sessions/:session/:io and the SSE GET loader on the same path are row-optional — `:session` may be a `chatId` (externalId) that hasn't been upserted yet. When the row is missing, both used to fall through `resolveStreamBasin` to the legacy global basin. 
If the row was then created with a per-org basin stamp, follow-up appends and SSE subscribes resolved to per-org while the PUT-returned headers still pointed at legacy — caller writes via those headers landed in the wrong place. Resolve via the org's current basin when the row is absent. A fresh session row would be stamped with that same basin at create time, so all subsequent ops converge. Pre-migration rows (row exists, column null) keep their legacy fallback because `organization` is only passed in the no-row branch — `session.streamBasinName === null` still falls through to the env var, not to the org column. Verified by curl: PUT against a fresh externalId for an org with a per-org basin returns `X-S2-Basin: triggerdotdev-dev-org-<orgId>`; same call for a free-org key still returns the legacy basin. --- .../routes/realtime.v1.sessions.$session.$io.ts | 15 +++++++++++++++ .../services/realtime/v1StreamsGlobal.server.ts | 12 +++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index 36a30761b01..5aff04cfec5 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -59,8 +59,18 @@ const { action } = createActionApiRoute( }); } + // When the row is missing (externalId form, row not yet upserted), + // default to the org's current basin instead of falling through to + // the legacy global. A fresh session row would be stamped with the + // org's basin at creation time, so subsequent appends/subscribes + // would resolve to the same place — without this, PUT-returned + // headers point at legacy and the actual writes go to per-org once + // the row exists. Pre-migration rows (row exists, column null) keep + // their existing legacy behaviour because we only fall back to org + // when there's no row at all. 
const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: maybeSession, + organization: maybeSession ? null : authentication.environment.organization, }); if (!(realtimeStream instanceof S2RealtimeStreams)) { @@ -124,8 +134,13 @@ const loader = createLoaderApiRoute( }, }, async ({ params, request, authentication, resource }) => { + // Same row-optional reasoning as the PUT handler above: if no row + // exists yet, resolve via the org's current basin so the SSE + // subscribe lands in the same place that subsequent appends will + // (once the row gets created and stamped). const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: resource.row, + organization: resource.row ? null : authentication.environment.organization, }); if (!(realtimeStream instanceof S2RealtimeStreams)) { diff --git a/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts b/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts index a43d0e4e444..2d9330cfa7c 100644 --- a/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts +++ b/apps/webapp/app/services/realtime/v1StreamsGlobal.server.ts @@ -34,7 +34,10 @@ export const v1RealtimeStreams = singleton("realtimeStreams", initializeRedisRea * * 1. `run.streamBasinName` (set at trigger time, immutable per-run) * 2. `session.streamBasinName` (set at session create time) - * 3. `REALTIME_STREAMS_S2_BASIN` (the legacy / OSS / pre-backfill global) + * 3. `organization.streamBasinName` (current org basin — only useful + * when neither a run nor a session row exists yet, e.g. PUT init + * against an externalId before the row is created) + * 4. 
`REALTIME_STREAMS_S2_BASIN` (the legacy / OSS / pre-backfill global) * * Old runs / sessions that pre-date the per-org-basins migration carry * `null` columns and fall through to the global basin, which is the @@ -42,6 +45,11 @@ export const v1RealtimeStreams = singleton("realtimeStreams", initializeRedisRea * drains via S2 retention (~30d on prod today), this fallback can be * dropped — but it's cheap to keep as a safety net. * + * Callers should only pass `organization` when they know the row-bearing + * ref is absent (not when its column is null) — otherwise a pre-migration + * row's null column would short-circuit to the org's *current* basin + * instead of the legacy one its streams actually live in. + * * OSS / s2-lite installs always hit the global path because the * provisioner is gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED` * and `streamBasinName` is never written. @@ -49,12 +57,14 @@ export const v1RealtimeStreams = singleton("realtimeStreams", initializeRedisRea export type StreamBasinContext = { run?: { streamBasinName: string | null } | null; session?: { streamBasinName: string | null } | null; + organization?: { streamBasinName: string | null } | null; }; export function resolveStreamBasin(ctx: StreamBasinContext): string | undefined { return ( ctx.run?.streamBasinName ?? ctx.session?.streamBasinName ?? + ctx.organization?.streamBasinName ?? env.REALTIME_STREAMS_S2_BASIN ?? 
undefined ); From f55746db7671f53cbed294ffcfe9a8727ec5e51a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 21:13:45 +0100 Subject: [PATCH 10/13] docs(webapp): clarify reconfigure admin route's default vs retention paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSDoc referenced `v3.reconfigureStreamBasinForOrg` (a job that doesn't exist — the actual one is `v3.reconcileStreamBasinForOrg`) and didn't make clear that the default path runs the full reconciler, which can deprovision a basin if the org is now on a free plan. Spell that out so an operator hitting this route by hand isn't surprised. --- .../admin.api.v1.stream-basins.reconfigure.ts | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index 866045bbe47..a47c4c28125 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -9,17 +9,21 @@ import { import { commonWorker } from "~/v3/commonWorker.server"; /** - * Admin trigger for `v3.reconfigureStreamBasinForOrg`. The plan-change - * path in `setPlan` enqueues this automatically when billing is wired; - * this route exists for ops + e2e testing. + * Admin trigger for stream-basin reconfiguration. The plan-change path + * in `setPlan` enqueues the same reconcile job automatically when + * billing is wired; this route exists for ops + e2e testing. * - * - Default (`{ orgId }`): enqueues the worker job which resolves the - * retention from the org's plan and PATCHes the basin to match. - * No-op when billing isn't configured (OSS). - * - With `retention`: bypasses the billing lookup and runs reconfigure - * inline against the given duration string (e.g. `"7d"`, `"30d"`, - * `"365d"`, `"1y"`). 
Useful for validating the PATCH wire shape - * end-to-end and as a manual override (e.g. enterprise contracts). + * - Default (`{ orgId }`): enqueues `v3.reconcileStreamBasinForOrg`, + * the full reconciler. It resolves retention from the org's current + * plan and either provisions, reconfigures, or deprovisions the basin + * to match — including nulling `streamBasinName` if the org is now on + * a free plan. No-op when billing isn't configured (OSS) or when + * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false`. + * - With `retention`: skips the worker queue and the reconciler entirely. + * Calls `reconfigureBasinForOrg` inline with the given duration string + * (e.g. `"7d"`, `"30d"`, `"365d"`, `"1y"`). Useful for validating the + * PATCH wire shape end-to-end and as a manual override (e.g. + * enterprise contracts) — does NOT touch the column or check the plan. */ const BodySchema = z .object({ From 684fc2e392d2b48970f8bba9cff9dcb41ea5b8ea Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 21:25:59 +0100 Subject: [PATCH 11/13] chore(webapp): trim per-org-basin comments Pass over the basin-related code to drop running commentary and cloud-product-specific phrasing. No behaviour change. - streamBasinProvisioner.server.ts: shorter module + helper docblocks; drop stale "synchronous org-create call site" rationale that no longer applies after the paid-only refactor. - streamBasinRetentionByPlan.server.ts: tighter module doc; collapse the reconcile-transitions narrative into a short table; drop the "cloud-flavored" framing. - v1StreamsGlobal.server.ts: short doc on resolveStreamBasin; drop references to specific operational state. - env.server.ts: terse one-liner per env var; drop sample basin name example. 
- platform.v3.server.ts, commonWorker.server.ts, runEngine/types.ts + index.ts, triggerTask.server.ts, api.v1.sessions.ts, the two admin routes and the two row-optional session-channel handlers: drop inline rationale paragraphs that re-explained the reconciler / read-precedence chain at every call site. --- apps/webapp/app/env.server.ts | 33 ++---- .../admin.api.v1.stream-basins.backfill.ts | 37 +----- .../admin.api.v1.stream-basins.reconfigure.ts | 24 ++-- ...uns.$runFriendlyId.session-streams.wait.ts | 17 +-- apps/webapp/app/routes/api.v1.sessions.ts | 6 - .../realtime.v1.sessions.$session.$io.ts | 17 +-- .../runEngine/services/triggerTask.server.ts | 4 - .../webapp/app/services/platform.v3.server.ts | 23 +--- .../realtime/streamBasinProvisioner.server.ts | 107 +++--------------- .../streamBasinRetentionByPlan.server.ts | 84 ++++---------- .../realtime/v1StreamsGlobal.server.ts | 36 ++---- apps/webapp/app/v3/commonWorker.server.ts | 5 - .../run-engine/src/engine/index.ts | 3 - .../run-engine/src/engine/types.ts | 4 - 14 files changed, 78 insertions(+), 322 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 4518295e14b..39b5c7804fe 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -5,12 +5,8 @@ import { isValidDatabaseUrl } from "./utils/db"; import { isValidRegex } from "./utils/regex"; import { isValidDuration } from "./services/realtime/duration.server"; -/** - * `z.string()` constrained to a duration string parseable by - * `parseDuration` (e.g. `7d`, `30d`, `365d`, `1h`). Validated at boot - * so a typo'd retention env var fails fast at startup rather than - * lurking until the first basin operation. - */ +// `z.string()` constrained to a `parseDuration`-parseable string (e.g. +// `7d`, `1h`). Validated at boot so a typo'd duration fails fast. 
function durationString() { return z .string() @@ -1519,33 +1515,20 @@ const EnvironmentSchema = z REALTIME_STREAMS_S2_FLUSH_INTERVAL_MS: z.coerce.number().int().default(100), REALTIME_STREAMS_S2_MAX_RETRIES: z.coerce.number().int().default(10), REALTIME_STREAMS_S2_WAIT_SECONDS: z.coerce.number().int().default(60), - /// Per-org basin migration. When "true", the webapp provisions a - /// dedicated S2 basin per org with plan-tied retention and stamps - /// `streamBasinName` on new TaskRun / Session rows. OSS / s2-lite - /// installs leave this off and keep using the single basin defined - /// by `REALTIME_STREAMS_S2_BASIN`. + // When "true", provision a dedicated S2 basin per org and stamp + // `streamBasinName` on new rows. Off keeps everything on the single + // basin defined by `REALTIME_STREAMS_S2_BASIN`. REALTIME_STREAMS_PER_ORG_BASINS_ENABLED: z.enum(["true", "false"]).default("false"), - /// Naming pattern for per-org basins: `{prefix}-{env}-org-{slug}` - /// e.g. `triggerdotdev-prod-org-acme-corp`. Cluster + tier shorthand - /// — kept short to stay under S2's basin-name length limit. + // Per-org basin name = `{prefix}-{env}-org-{orgId}`. REALTIME_STREAMS_BASIN_NAME_PREFIX: z.string().default("triggerdotdev"), REALTIME_STREAMS_BASIN_NAME_ENV: z.string().default("dev"), - /// Default retention for new basins (S2 duration syntax: 7d / 30d / 1y). - /// Used at org-create and as the fallback when no plan-specific - /// retention is resolved. Operators that don't run a billing API - /// only need this one. REALTIME_STREAMS_BASIN_DEFAULT_RETENTION: durationString().default("30d"), - /// Plan-specific retention overrides — only consulted by the - /// optional `streamBasinRetentionByPlan` shim. Operators that - /// don't map plans to retention (OSS, self-hosted) can ignore - /// these and rely on the default above. + // Plan-specific retention overrides consulted by the + // streamBasinRetentionByPlan shim only. 
REALTIME_STREAMS_BASIN_RETENTION_FREE: durationString().default("7d"), REALTIME_STREAMS_BASIN_RETENTION_HOBBY: durationString().default("30d"), REALTIME_STREAMS_BASIN_RETENTION_PRO: durationString().default("365d"), - /// Storage class applied to per-org basins at create time. REALTIME_STREAMS_BASIN_STORAGE_CLASS: z.enum(["express", "standard"]).default("express"), - /// `delete_on_empty_min_age` applied to per-org basins. Streams - /// that go empty for this long are reaped automatically. REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE: durationString().default("1h"), REALTIME_STREAMS_DEFAULT_VERSION: z.enum(["v1", "v2"]).default("v1"), WAIT_UNTIL_TIMEOUT_MS: z.coerce.number().int().default(600_000), diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts index 90f794a7a1c..830307d923a 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts @@ -7,23 +7,9 @@ import { commonWorker } from "~/v3/commonWorker.server"; import { logger } from "~/services/logger.server"; /** - * One-shot backfill that enqueues `v3.reconcileStreamBasinForOrg` for - * every non-deleted org. The reconciler decides per-org what to do: - * provision a basin for paid orgs that don't have one, reconfigure - * retention for paid orgs whose tier changed, deprovision (null the - * column) for free orgs that were mistakenly provisioned. Idempotent - * — re-running converges to the desired state. - * - * - Admin auth via `requireAdminApiRequest` (PAT in `Authorization`). - * - Refuses to run when `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false` - * so OSS / s2-lite installs can't accidentally trigger basin - * operations against a misconfigured backend. - * - `dryRun=true` (default false) returns the count without enqueueing. - * - `limit` (default 1000, max 10000) caps a single invocation. 
To - * page through more orgs than `limit`, pass `afterOrgId` from the - * previous response's `nextAfterOrgId`. - * - Each job is keyed `reconcileStreamBasin:` so concurrent - * calls converge to one job per org. + * Backfill: enqueue `v3.reconcileStreamBasinForOrg` for every + * non-deleted org. Idempotent. Page through `>limit` orgs by passing + * `afterOrgId` from the previous response's `nextAfterOrgId`. */ const BodySchema = z @@ -73,11 +59,8 @@ export async function action({ request }: ActionFunctionArgs) { const { dryRun, limit, afterOrgId } = parsed; - // Walk every non-deleted org. The reconcile worker is fast for the - // no-op case (free with null column) so enqueueing for all is fine - // — saves us from doing per-org billing lookups here just to filter - // candidates. Cursor on `id` (cuid is sortable) gives stable paging - // across calls; `createdAt` ties get broken by the cursor. + // Reconcile is fast for the no-op case, so we enqueue for all orgs + // rather than filter on plan here. const candidates = await prisma.organization.findMany({ where: { deletedAt: null }, orderBy: { id: "asc" }, @@ -89,9 +72,6 @@ export async function action({ request }: ActionFunctionArgs) { const lastReturnedId = candidates[candidates.length - 1]?.id; const nextAfterOrgId = candidates.length === limit && lastReturnedId ? lastReturnedId : null; - // Orgs still beyond the cursor we just returned. On the final page, - // `lastReturnedId` is undefined (empty result) or the response is short - // of `limit`, so this is 0 — exactly what the caller needs to stop. const remaining = lastReturnedId ? await prisma.organization.count({ where: { deletedAt: null, id: { gt: lastReturnedId } }, @@ -128,10 +108,6 @@ export async function action({ request }: ActionFunctionArgs) { } } - // `remaining` counts orgs strictly past the cursor returned to the - // caller. 
Enqueue failures don't change this — re-running with the - // same `afterOrgId` would page through the same window and the - // per-org idempotency key keeps it safe. const response: BackfillResponse = { ok: true, dryRun: false, @@ -151,8 +127,7 @@ export async function action({ request }: ActionFunctionArgs) { return json(response); } -// GET returns the current state without doing anything — useful for -// monitoring "is the backfill done yet?" from a dashboard / curl. +// GET: read-only progress — orgs with vs without a basin stamped. export async function loader({ request }: ActionFunctionArgs) { await requireAdminApiRequest(request); diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts index a47c4c28125..2039a080530 100644 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts @@ -9,21 +9,14 @@ import { import { commonWorker } from "~/v3/commonWorker.server"; /** - * Admin trigger for stream-basin reconfiguration. The plan-change path - * in `setPlan` enqueues the same reconcile job automatically when - * billing is wired; this route exists for ops + e2e testing. + * Admin route for forcing a basin reconfigure for an org. Two modes: * - * - Default (`{ orgId }`): enqueues `v3.reconcileStreamBasinForOrg`, - * the full reconciler. It resolves retention from the org's current - * plan and either provisions, reconfigures, or deprovisions the basin - * to match — including nulling `streamBasinName` if the org is now on - * a free plan. No-op when billing isn't configured (OSS) or when - * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false`. - * - With `retention`: skips the worker queue and the reconciler entirely. - * Calls `reconfigureBasinForOrg` inline with the given duration string - * (e.g. `"7d"`, `"30d"`, `"365d"`, `"1y"`). 
Useful for validating the - * PATCH wire shape end-to-end and as a manual override (e.g. - * enterprise contracts) — does NOT touch the column or check the plan. + * - `{ orgId }`: enqueues `v3.reconcileStreamBasinForOrg` (the full + * reconciler). May provision, reconfigure, or deprovision based on + * the org's current plan. + * - `{ orgId, retention }`: bypasses the reconciler and PATCHes the + * basin retention inline against the given duration. Doesn't touch + * the column or check the plan. */ const BodySchema = z .object({ @@ -58,9 +51,6 @@ export async function action({ request }: ActionFunctionArgs) { } if (parsed.data.retention) { - // Direct, synchronous reconfigure with the explicit retention. - // Skips the worker queue + billing lookup so the PATCH is - // verifiable in the response. Errors surface as 500. await reconfigureBasinForOrg(parsed.data.orgId, parsed.data.retention); return json({ ok: true, diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index 90338726799..ed1304f4349 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -123,19 +123,10 @@ const { action, loader } = createActionApiRoute( // and remove the pending registration. if (!result.isCached) { try { - // Session streams are always v2 (S2) — the writer in - // `appendPartToSessionStream` and the SSE subscribe both - // hardcode "v2", so the race-check reader has to match. - // Don't fall through to the run's own `realtimeStreamsVersion`, - // which only describes the run's run-scoped streams. - // - // Resolve basin from `session` only (not `run`). The append-side - // writer in `realtime.v1.sessions.$session.$io.append.ts` passes - // only `{ session }`, and `resolveStreamBasin` prefers `run` over - // `session` when both are present. 
During the per-org-basin - // migration window, `run.streamBasinName` and - // `session.streamBasinName` can differ — the writes land in the - // session's basin, so the race-check has to read from the same. + // Session streams are hardcoded v2 by the append-side writer + // and SSE subscribe, so the race-check reader matches. Basin + // comes from `session` only — the writer side passes the same + // and we have to read from the same basin to find the record. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: maybeSession, }); diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index 708d791ff49..eafb0f7a20c 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -167,9 +167,6 @@ const { action } = createActionApiRoute( runtimeEnvironmentId: authentication.environment.id, environmentType: authentication.environment.type, organizationId: authentication.environment.organizationId, - // Stamp the org's S2 basin so realtime reads on this - // session's `.in/.out` channels resolve without joining - // Organization. Null until per-org basins are provisioned. streamBasinName: authentication.environment.organization.streamBasinName, }, update: { triggerConfig: triggerConfigJson }, @@ -190,9 +187,6 @@ const { action } = createActionApiRoute( runtimeEnvironmentId: authentication.environment.id, environmentType: authentication.environment.type, organizationId: authentication.environment.organizationId, - // Stamp the org's S2 basin so realtime reads on this - // session's `.in/.out` channels resolve without joining - // Organization. Null until per-org basins are provisioned. 
streamBasinName: authentication.environment.organization.streamBasinName, }, }); diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index 5aff04cfec5..37ec58c51ae 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -59,15 +59,9 @@ const { action } = createActionApiRoute( }); } - // When the row is missing (externalId form, row not yet upserted), - // default to the org's current basin instead of falling through to - // the legacy global. A fresh session row would be stamped with the - // org's basin at creation time, so subsequent appends/subscribes - // would resolve to the same place — without this, PUT-returned - // headers point at legacy and the actual writes go to per-org once - // the row exists. Pre-migration rows (row exists, column null) keep - // their existing legacy behaviour because we only fall back to org - // when there's no row at all. + // No-row form: resolve via the org so the stream initialised here + // matches what later appends/subscribes will land on once the row + // is created. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: maybeSession, organization: maybeSession ? null : authentication.environment.organization, @@ -134,10 +128,7 @@ const loader = createLoaderApiRoute( }, }, async ({ params, request, authentication, resource }) => { - // Same row-optional reasoning as the PUT handler above: if no row - // exists yet, resolve via the org's current basin so the SSE - // subscribe lands in the same place that subsequent appends will - // (once the row gets created and stamped). + // Same no-row fallback as PUT above. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: resource.row, organization: resource.row ? 
null : authentication.environment.organization, diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index 0d58b607b69..445e0eb155a 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -395,10 +395,6 @@ export class RunEngineTriggerTaskService { bulkActionId: body.options?.bulkActionId, planType, realtimeStreamsVersion: options.realtimeStreamsVersion, - // Stamp the org's S2 basin onto the new TaskRun so - // realtime read paths can resolve the basin without - // joining `Organization`. Null in OSS / pre-backfill; - // reads then fall back to the global basin env var. streamBasinName: environment.organization.streamBasinName, debounce: body.options?.debounce, annotations, diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index 1e312c5e7ae..265d73b9cb7 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ b/apps/webapp/app/services/platform.v3.server.ts @@ -434,32 +434,15 @@ export async function setPlan( } } -/** - * Best-effort enqueue: when an org's plan changes we reconcile its - * stream-basin state. The reconciler handles every transition: - * - * free → paid: provision a dedicated basin with the plan's retention. - * paid → paid: reconfigure the existing basin's retention. - * paid → free: null `Organization.streamBasinName`. Future runs/sessions - * flow to the shared global basin; the per-org basin - * lingers until existing streams age out on their original - * retention. - * free → free: no-op. - * - * Idempotent and a no-op when per-org basins are disabled or billing - * isn't configured. Failures are logged but never block the plan - * change itself — billing has already accepted by the time we reach - * this code. - */ +// Best-effort: failures are logged but never block the plan change. 
+// The reconciler is idempotent and re-reads the plan when it runs, so +// concurrent plan changes collapse to one pending job per org. async function enqueueStreamBasinReconcile(orgId: string) { try { const { commonWorker } = await import("~/v3/commonWorker.server"); await commonWorker.enqueue({ job: "v3.reconcileStreamBasinForOrg", payload: { orgId }, - // Per-org dedupe key — concurrent plan changes collapse to one - // pending reconcile job. The job re-reads the current plan when - // it executes, so the latest tier wins. id: `reconcileStreamBasin:${orgId}`, }); } catch (error) { diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 42eb7cdf1aa..97a7f05b59e 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -1,34 +1,10 @@ /** - * Per-org S2 basin provisioning. + * Per-org S2 basin provisioning. Gated by + * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED`: when off, all orgs share + * `REALTIME_STREAMS_S2_BASIN` and this module no-ops. * - * The webapp runs in two modes for realtime stream storage: - * - * - **Single-basin mode** (OSS / s2-lite installs): - * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=false`. All orgs share the - * basin in `REALTIME_STREAMS_S2_BASIN`. `Organization.streamBasinName` - * stays null forever; reads / writes resolve to the global basin. - * - * - **Per-org-basin mode**: - * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true`. Each org gets a - * dedicated basin with its own retention. The basin is the unit of - * cost attribution (S2 exposes per-basin metrics) and isolation - * (access tokens scope to one basin). - * - * This module is purely retention-string-driven: callers pass a - * duration like `"30d"` and the provisioner does the S2 round-trip. 
- * It has no concept of plans / tiers / billing — operators that want - * per-tier retention live one layer up (see - * `streamBasinRetentionByPlan.server.ts`). - * - * Provisioning is one-shot per org: at creation time (or via the - * backfill worker job for existing orgs) we create the basin and stamp - * `Organization.streamBasinName`. New `TaskRun` / `Session` rows then - * piggyback on the existing org read in `triggerTask` / session-create - * paths and copy the value through. Reads use a precedence chain - * (`run.streamBasinName ?? session.streamBasinName ?? globalBasin`). - * - * Plan / retention changes update retention in-place via - * `reconfigureBasin`. We do not move data across basins. + * Pure retention-string in / S2-call out. No plan or billing + * vocabulary — that lives in `streamBasinRetentionByPlan.server.ts`. */ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; @@ -40,29 +16,13 @@ export function isPerOrgBasinsEnabled(): boolean { return env.REALTIME_STREAMS_PER_ORG_BASINS_ENABLED === "true"; } -/** - * Default retention for new orgs and any caller that doesn't specify - * a value. Configurable via `REALTIME_STREAMS_BASIN_DEFAULT_RETENTION`. - */ export function defaultRetention(): string { return env.REALTIME_STREAMS_BASIN_DEFAULT_RETENTION; } -/** - * Build the basin name for an org. Format: `{prefix}-{env}-org-{id}`. - * - * We use the org's `id` (cuid, fixed-length, unique-by-construction) - * rather than the slug. Slugs are user-influenced, can change, and — - * critically — could collide across orgs once truncated to fit the - * S2 basin-name length cap. cuid is short (25 chars) and never - * collides, so the basin name is stable and tenant-isolated by - * construction. - * - * Format check: `triggerdotdev-prod-org-{25 chars}` is 47 chars total, - * comfortably under the conventional 63-char cap. 
If you change the - * prefix / env-name to something extreme, this still fails fast at - * S2's validator. - */ +// Org id is a cuid — fixed-length and stable, so the basin name is +// collision-free without truncation. Slugs are user-editable and would +// drift. export function basinNameForOrg(org: { id: string }): string { const prefix = env.REALTIME_STREAMS_BASIN_NAME_PREFIX; const envName = env.REALTIME_STREAMS_BASIN_NAME_ENV; @@ -71,9 +31,6 @@ export function basinNameForOrg(org: { id: string }): string { type ProvisionInput = { id: string; - /// Duration string passed straight to S2. Defaults to - /// `defaultRetention()` when omitted. Caller decides; the provisioner - /// has no opinion about what retention is appropriate. retention?: string; streamBasinName: string | null | undefined; }; @@ -82,19 +39,8 @@ type ProvisionResult = | { kind: "skipped"; reason: "feature-disabled" | "already-provisioned"; basin: string | null } | { kind: "provisioned"; basin: string; retention: string }; -/** - * Idempotent: if the org already has `streamBasinName`, returns the - * existing value without contacting S2. Otherwise creates the basin - * (S2 returns 409 on race with another caller — we treat that as - * success) and writes the column. - * - * Failure modes: - * - S2 unreachable / 5xx / timeout: throws. Callers in the org-create - * path swallow + leave the column null so the backfill worker can - * retry, so signup never fails on a transient S2 outage. - * - Auth misconfig (no token): throws. Should never happen in - * per-org-basins mode but worth surfacing loudly. - */ +// Idempotent. Treats S2 409 as success (race with another caller, or +// previous run that crashed after S2 ack but before the column write). 
export async function provisionBasinForOrg( org: ProvisionInput, prismaClient: PrismaClientOrTransaction = prisma @@ -138,10 +84,6 @@ export async function provisionBasinForOrg( return { kind: "provisioned", basin, retention }; } -/** - * Update retention in-place. Idempotent. No-op when the org has no - * provisioned basin. - */ export async function reconfigureBasinForOrg( orgId: string, retention: string @@ -150,10 +92,6 @@ export async function reconfigureBasinForOrg( const accessToken = env.REALTIME_STREAMS_S2_ACCESS_TOKEN; if (!accessToken) { - // Per-org basins are enabled but no token is configured — that's a - // misconfiguration, not a no-op condition. Throw so the worker job - // surfaces in the queue's failure log instead of silently leaving - // retention stale on the basin. throw new Error( "REALTIME_STREAMS_S2_ACCESS_TOKEN must be set when REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true" ); @@ -174,19 +112,15 @@ export async function reconfigureBasinForOrg( }); } -// ---------- S2 REST ---------- -// -// Account-level API: `POST /v1/basins` to create, `PATCH /v1/basins/{name}` -// to reconfigure. The wire shape uses integer seconds for durations -// (`retention_policy.age`, `delete_on_empty.min_age_secs`) — the human -// strings (`7d`, `30d`, `1y`) are env-var ergonomics that we parse on -// the way out. +// S2 REST: POST /v1/basins to create, PATCH /v1/basins/{name} to +// reconfigure. Wire shape takes integer seconds; we accept human strings +// like "7d" / "1y" as env-var ergonomics and parse them here. type CreateBasinOptions = { accessToken: string; - retentionPolicy: string; // e.g. "7d", "30d", "365d" + retentionPolicy: string; storageClass: "express" | "standard"; - deleteOnEmptyMinAge: string; // e.g. 
"1h" + deleteOnEmptyMinAge: string; }; async function s2CreateBasin(name: string, opts: CreateBasinOptions): Promise { @@ -205,10 +139,6 @@ async function s2CreateBasin(name: string, opts: CreateBasinOptions): Promise ""); @@ -241,9 +169,6 @@ async function s2ReconfigureBasin(name: string, opts: ReconfigureBasinOptions): }; const res = await fetch(url, { - // Same 10s ceiling as create. The reconfigure path runs from the - // worker, so a timeout here just fails the job and lets redis-worker - // retry naturally. signal: AbortSignal.timeout(10_000), method: "PATCH", headers: { diff --git a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts index d35471d11e0..0d08782a65c 100644 --- a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts @@ -1,16 +1,8 @@ /** - * Cloud-flavored shim that maps an org's billing plan to its - * stream-basin state — both whether it should have a dedicated basin - * at all, and what retention to apply if so. - * - * Kept deliberately separate from `streamBasinProvisioner.server.ts` - * so the provisioner stays purely retention-string-driven and has no - * coupling to plan vocabulary. This file is the only place in the - * webapp that maps "plan code" → "basin policy". - * - * Operators that don't run a billing API never call this — orgs stay - * on the global shared basin via the existing read-precedence - * fallback. + * Plan → basin policy. The only place that knows which plan codes + * earn a dedicated basin and what retention each gets. Operators + * without a billing API never reach this — orgs stay on the shared + * basin via the read-precedence fallback. 
*/ import { prisma } from "~/db.server"; import { env } from "~/env.server"; @@ -23,27 +15,15 @@ import { reconfigureBasinForOrg, } from "./streamBasinProvisioner.server"; -/** - * Plan codes that get a dedicated per-org basin. Free orgs (and - * unbilled / unknown plan codes) fall through to the shared global - * basin via the existing read-precedence fallback. - * - * Adding a plan: drop its code in here AND in `retentionForPlanCode`. - */ +// Adding a plan: drop its code here AND in `retentionForPlanCode`. +// Exact-match against a known set; substring matching could grant the +// wrong tier (e.g. `"approved"` would match `"pro"`). const PAID_PLAN_CODES = new Set(["v3_hobby_1", "v3_pro_1", "enterprise"]); export function isPaidPlanCode(code: string | null | undefined): boolean { return code != null && PAID_PLAN_CODES.has(code); } -/** - * Map a plan code to a retention duration via env-var lookup. - * - * Exact-match against a small known set rather than substring matching, - * since substring matching against future plan codes could grant the - * wrong tier (e.g. `"approved"` would match `"pro"`). Add a new code - * here when launching a new plan. - */ export function retentionForPlanCode(code: string | null | undefined): string { if (!code) return defaultRetention(); @@ -73,42 +53,25 @@ type ReconcileResult = | { kind: "reconfigured"; retention: string } | { kind: "deprovisioned" }; -/** - * Reconcile an org's basin state with its current plan. Idempotent; - * call whenever the plan changes or in a backfill loop. - * - * Transitions: - * - * plan paid + no basin → provision a new basin, stamp column. - * plan paid + has basin → reconfigure retention (tier may have - * changed). S2 retention only applies to - * *new* streams, but that's fine — old - * ones live out their original retention. - * plan free + has basin → null the column. New runs/sessions for - * this org route through the shared global - * basin. 
The per-org basin lingers until - * its existing streams expire on their - * original retention; no S2-side cleanup - * happens here. - * plan free + no basin → no-op. - * - * OSS / non-billing installs always hit the no-op path because - * `isBillingConfigured()` is false. Free-by-default. - * - * Throws on transient billing failure so redis-worker retries — - * silently defaulting to "free" during an outage would deprovision a - * paid org's basin and lose isolation. - */ +// Reconcile an org's basin state with its current plan. Idempotent. +// +// paid + no basin → provision, stamp column. +// paid + has basin → reconfigure retention (in case the tier changed). +// free + has basin → null the column; basin lingers until its streams +// age out on their original retention. +// free + no basin → no-op. +// +// Throws on transient billing failure so redis-worker retries — +// silently defaulting to "free" during an outage would deprovision a +// paid org's basin. export async function reconcileBasinForOrg(orgId: string): Promise { if (!isBillingConfigured()) { return { kind: "skipped", reason: "billing-not-configured" }; } - // Feature flag is the master switch for the whole per-org basin - // pipeline — `provisionBasinForOrg` / `reconfigureBasinForOrg` both - // no-op when it's off. Bail here so the reconcile log lines and - // result kinds reflect reality (no "provisioned (paid upgrade)" log - // for a no-op call), and skip the billing API round-trip entirely. + // Provisioner / reconfigure both no-op when the flag is off. Bail + // here so logs and result kinds reflect that, and skip the billing + // round-trip we couldn't act on anyway. 
if (!isPerOrgBasinsEnabled()) { return { kind: "skipped", reason: "feature-disabled" }; } @@ -154,9 +117,8 @@ export async function reconcileBasinForOrg(orgId: string): Promise { - // Bring the org's basin state in line with its current plan: - // provision on free→paid, reconfigure retention on tier change, - // null the column on paid→free (the basin itself lingers; old - // streams age out naturally). Idempotent — safe to enqueue - // from setPlan branches and the backfill loop. await reconcileBasinForOrg(payload.orgId); }, }, diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 84bb9054654..1725587df45 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -661,9 +661,6 @@ export class RunEngine { bulkActionGroupIds: bulkActionId ? [bulkActionId] : undefined, planType, realtimeStreamsVersion, - // Stamp the org's S2 basin so realtime reads resolve - // without joining Organization. Null in OSS / pre-backfill; - // read precedence falls back to the global basin env var. streamBasinName, debounce: debounce ? { diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index 251b3c0bd78..0b17262ba1c 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -259,10 +259,6 @@ export type TriggerParams = { bulkActionId?: string; planType?: string; realtimeStreamsVersion?: string; - /// S2 basin where this run's realtime streams live. Stamped onto - /// the new TaskRun row so realtime read paths can resolve the basin - /// without joining `Organization`. Null in OSS / pre-backfill — - /// reads then fall back to the global basin env var. 
streamBasinName?: string | null; debounce?: { key: string; From 9cb611d0042c4844b381fb5a8b933bda39da1956 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 22:28:05 +0100 Subject: [PATCH 12/13] refactor(webapp): cloud-driven basin sync, drop reconcile worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cloud billing app now drives basin lifecycle operations via a single webapp admin endpoint instead of the webapp enqueuing a reconcile worker that called back into billing to resolve plan + retention. Plan vocabulary now lives in cloud (`Limits.streamBasinRetention`); S2 access stays in the webapp. New: POST /admin/api/v1/orgs/:orgId/stream-basin - { action: "ensure", retention } — provision or PATCH retention - { action: "deprovision" } — null the column Helpers: ensureBasinForOrg / deprovisionBasinForOrg in the provisioner. Removed: - streamBasinRetentionByPlan.server.ts (plan vocabulary) - v3.reconcileStreamBasinForOrg worker job - enqueueStreamBasinReconcile from the three setPlan branches - admin.api.v1.stream-basins.{backfill,reconfigure}.ts (replaced by the per-org sync endpoint) - REALTIME_STREAMS_BASIN_RETENTION_FREE/HOBBY/PRO env vars The .server-changes entry stays accurate — feature behaviour is unchanged from a customer's perspective. 
--- .server-changes/per-org-stream-basins.md | 2 +- apps/webapp/app/env.server.ts | 5 - ...pi.v1.orgs.$organizationId.stream-basin.ts | 47 ++++++ .../admin.api.v1.stream-basins.backfill.ts | 146 ------------------ .../admin.api.v1.stream-basins.reconfigure.ts | 70 --------- .../webapp/app/services/platform.v3.server.ts | 22 --- .../realtime/streamBasinProvisioner.server.ts | 66 +++++++- .../streamBasinRetentionByPlan.server.ts | 135 ---------------- apps/webapp/app/v3/commonWorker.server.ts | 13 -- 9 files changed, 112 insertions(+), 394 deletions(-) create mode 100644 apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.stream-basin.ts delete mode 100644 apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts delete mode 100644 apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts delete mode 100644 apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts diff --git a/.server-changes/per-org-stream-basins.md b/.server-changes/per-org-stream-basins.md index a97a30d80f3..4e45129849c 100644 --- a/.server-changes/per-org-stream-basins.md +++ b/.server-changes/per-org-stream-basins.md @@ -3,4 +3,4 @@ area: webapp type: feature --- -Per-org S2 stream basins with plan-tied retention (free 7d / hobby 30d / pro 365d), gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED`. Stops basin retention from deleting streams out from under live chat sessions and unlocks per-org cost attribution via S2 basin metrics. +Per-org S2 stream basins with retention tied to the org's billing plan, gated by `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED`. Stops basin retention from deleting streams out from under live chat sessions and unlocks per-org cost attribution. 
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 39b5c7804fe..13e9e5dacbd 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1523,11 +1523,6 @@ const EnvironmentSchema = z REALTIME_STREAMS_BASIN_NAME_PREFIX: z.string().default("triggerdotdev"), REALTIME_STREAMS_BASIN_NAME_ENV: z.string().default("dev"), REALTIME_STREAMS_BASIN_DEFAULT_RETENTION: durationString().default("30d"), - // Plan-specific retention overrides consulted by the - // streamBasinRetentionByPlan shim only. - REALTIME_STREAMS_BASIN_RETENTION_FREE: durationString().default("7d"), - REALTIME_STREAMS_BASIN_RETENTION_HOBBY: durationString().default("30d"), - REALTIME_STREAMS_BASIN_RETENTION_PRO: durationString().default("365d"), REALTIME_STREAMS_BASIN_STORAGE_CLASS: z.enum(["express", "standard"]).default("express"), REALTIME_STREAMS_BASIN_DELETE_ON_EMPTY_MIN_AGE: durationString().default("1h"), REALTIME_STREAMS_DEFAULT_VERSION: z.enum(["v1", "v2"]).default("v1"), diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.stream-basin.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.stream-basin.ts new file mode 100644 index 00000000000..67fd457c6b3 --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.stream-basin.ts @@ -0,0 +1,47 @@ +import { ActionFunctionArgs, json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { isValidDuration } from "~/services/realtime/duration.server"; +import { + deprovisionBasinForOrg, + ensureBasinForOrg, +} from "~/services/realtime/streamBasinProvisioner.server"; + +const ParamsSchema = z.object({ organizationId: z.string() }); + +const BodySchema = z.discriminatedUnion("action", [ + z.object({ + action: z.literal("ensure"), + retention: z + .string() + .refine(isValidDuration, "retention must be a duration like 7d, 30d, 365d, 1h, 1y"), + }), + z.object({ 
action: z.literal("deprovision") }), +]); + +export async function action({ request, params }: ActionFunctionArgs) { + await requireAdminApiRequest(request); + + const { organizationId } = ParamsSchema.parse(params); + + let parsed: z.infer; + try { + const text = await request.text(); + const raw = text.length > 0 ? JSON.parse(text) : {}; + const result = BodySchema.safeParse(raw); + if (!result.success) { + return json({ ok: false, error: result.error.flatten() }, { status: 400 }); + } + parsed = result.data; + } catch { + return json({ ok: false, error: "Invalid JSON body" }, { status: 400 }); + } + + if (parsed.action === "ensure") { + const result = await ensureBasinForOrg(organizationId, parsed.retention); + return json({ ok: true, ...result }); + } + + const result = await deprovisionBasinForOrg(organizationId); + return json({ ok: true, ...result }); +} diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts deleted file mode 100644 index 830307d923a..00000000000 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.backfill.ts +++ /dev/null @@ -1,146 +0,0 @@ -import { json, type ActionFunctionArgs } from "@remix-run/server-runtime"; -import { z } from "zod"; -import { prisma } from "~/db.server"; -import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; -import { isPerOrgBasinsEnabled } from "~/services/realtime/streamBasinProvisioner.server"; -import { commonWorker } from "~/v3/commonWorker.server"; -import { logger } from "~/services/logger.server"; - -/** - * Backfill: enqueue `v3.reconcileStreamBasinForOrg` for every - * non-deleted org. Idempotent. Page through `>limit` orgs by passing - * `afterOrgId` from the previous response's `nextAfterOrgId`. 
- */ - -const BodySchema = z - .object({ - dryRun: z.boolean().optional().default(false), - limit: z.number().int().min(1).max(10_000).optional().default(1000), - afterOrgId: z.string().optional(), - }) - .strict(); - -type BackfillResponse = { - ok: true; - dryRun: boolean; - enqueued: number; - pending: number; - remaining: number; - orgIds: string[]; - nextAfterOrgId: string | null; -}; - -export async function action({ request }: ActionFunctionArgs) { - await requireAdminApiRequest(request); - - if (!isPerOrgBasinsEnabled()) { - return json( - { - ok: false, - error: - "Per-org stream basins are disabled. Set REALTIME_STREAMS_PER_ORG_BASINS_ENABLED=true before running the backfill.", - }, - { status: 400 } - ); - } - - let parsed: z.infer; - try { - const text = await request.text(); - const raw = text.length > 0 ? JSON.parse(text) : {}; - const result = BodySchema.safeParse(raw); - if (!result.success) { - return json({ ok: false, error: result.error.flatten() }, { status: 400 }); - } - parsed = result.data; - } catch { - return json({ ok: false, error: "Invalid JSON body" }, { status: 400 }); - } - - const { dryRun, limit, afterOrgId } = parsed; - - // Reconcile is fast for the no-op case, so we enqueue for all orgs - // rather than filter on plan here. - const candidates = await prisma.organization.findMany({ - where: { deletedAt: null }, - orderBy: { id: "asc" }, - take: limit, - ...(afterOrgId ? { cursor: { id: afterOrgId }, skip: 1 } : {}), - select: { id: true }, - }); - - const lastReturnedId = candidates[candidates.length - 1]?.id; - const nextAfterOrgId = candidates.length === limit && lastReturnedId ? lastReturnedId : null; - - const remaining = lastReturnedId - ? 
await prisma.organization.count({ - where: { deletedAt: null, id: { gt: lastReturnedId } }, - }) - : 0; - - if (dryRun) { - const response: BackfillResponse = { - ok: true, - dryRun: true, - enqueued: 0, - pending: candidates.length, - remaining, - orgIds: candidates.map((o) => o.id), - nextAfterOrgId, - }; - return json(response); - } - - let enqueued = 0; - for (const org of candidates) { - try { - await commonWorker.enqueue({ - job: "v3.reconcileStreamBasinForOrg", - payload: { orgId: org.id }, - id: `reconcileStreamBasin:${org.id}`, - }); - enqueued += 1; - } catch (error) { - logger.error("[stream-basins-backfill] enqueue failed", { - orgId: org.id, - error: error instanceof Error ? error.message : String(error), - }); - } - } - - const response: BackfillResponse = { - ok: true, - dryRun: false, - enqueued, - pending: candidates.length, - remaining, - orgIds: candidates.map((o) => o.id), - nextAfterOrgId, - }; - - logger.info("[stream-basins-backfill] enqueued reconcile jobs", { - enqueued, - candidates: candidates.length, - remaining: response.remaining, - }); - - return json(response); -} - -// GET: read-only progress — orgs with vs without a basin stamped. 
-export async function loader({ request }: ActionFunctionArgs) { - await requireAdminApiRequest(request); - - const totalOrgs = await prisma.organization.count({ where: { deletedAt: null } }); - const withBasin = await prisma.organization.count({ - where: { deletedAt: null, NOT: { streamBasinName: null } }, - }); - - return json({ - ok: true, - perOrgBasinsEnabled: isPerOrgBasinsEnabled(), - totalOrgs, - withBasin, - withoutBasin: totalOrgs - withBasin, - }); -} diff --git a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts b/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts deleted file mode 100644 index 2039a080530..00000000000 --- a/apps/webapp/app/routes/admin.api.v1.stream-basins.reconfigure.ts +++ /dev/null @@ -1,70 +0,0 @@ -import { json, type ActionFunctionArgs } from "@remix-run/server-runtime"; -import { z } from "zod"; -import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; -import { isValidDuration } from "~/services/realtime/duration.server"; -import { - isPerOrgBasinsEnabled, - reconfigureBasinForOrg, -} from "~/services/realtime/streamBasinProvisioner.server"; -import { commonWorker } from "~/v3/commonWorker.server"; - -/** - * Admin route for forcing a basin reconfigure for an org. Two modes: - * - * - `{ orgId }`: enqueues `v3.reconcileStreamBasinForOrg` (the full - * reconciler). May provision, reconfigure, or deprovision based on - * the org's current plan. - * - `{ orgId, retention }`: bypasses the reconciler and PATCHes the - * basin retention inline against the given duration. Doesn't touch - * the column or check the plan. 
- */ -const BodySchema = z - .object({ - orgId: z.string(), - retention: z - .string() - .refine(isValidDuration, "retention must be a duration like 7d, 30d, 365d, 1h, 1y") - .optional(), - }) - .strict(); - -export async function action({ request }: ActionFunctionArgs) { - await requireAdminApiRequest(request); - - if (!isPerOrgBasinsEnabled()) { - return json( - { ok: false, error: "Per-org stream basins are disabled." }, - { status: 400 } - ); - } - - let parsed: ReturnType; - try { - const text = await request.text(); - const raw = text.length > 0 ? JSON.parse(text) : {}; - parsed = BodySchema.safeParse(raw); - } catch { - return json({ ok: false, error: "Invalid JSON body" }, { status: 400 }); - } - if (!parsed.success) { - return json({ ok: false, error: parsed.error.flatten() }, { status: 400 }); - } - - if (parsed.data.retention) { - await reconfigureBasinForOrg(parsed.data.orgId, parsed.data.retention); - return json({ - ok: true, - mode: "inline", - orgId: parsed.data.orgId, - retention: parsed.data.retention, - }); - } - - await commonWorker.enqueue({ - job: "v3.reconcileStreamBasinForOrg", - payload: { orgId: parsed.data.orgId }, - id: `reconcileStreamBasin:${parsed.data.orgId}`, - }); - - return json({ ok: true, mode: "queued", enqueued: parsed.data.orgId }); -} diff --git a/apps/webapp/app/services/platform.v3.server.ts b/apps/webapp/app/services/platform.v3.server.ts index 265d73b9cb7..6df93c9c0e9 100644 --- a/apps/webapp/app/services/platform.v3.server.ts +++ b/apps/webapp/app/services/platform.v3.server.ts @@ -403,7 +403,6 @@ export async function setPlan( // Invalidate billing cache since plan changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconcile(organization.id); return redirect(newProjectPath(organization, "You're on the Free plan.")); } else { return redirectWithErrorMessage( @@ -421,38 +420,17 @@ export async function setPlan( // 
Invalidate billing cache since subscription changed opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconcile(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription updated successfully."); } case "canceled_subscription": { // Invalidate billing cache since subscription was canceled opts?.invalidateBillingCache?.(organization.id); platformCache.entitlement.remove(organization.id).catch(() => {}); - await enqueueStreamBasinReconcile(organization.id); return redirectWithSuccessMessage(callerPath, request, "Subscription canceled."); } } } -// Best-effort: failures are logged but never block the plan change. -// The reconciler is idempotent and re-reads the plan when it runs, so -// concurrent plan changes collapse to one pending job per org. -async function enqueueStreamBasinReconcile(orgId: string) { - try { - const { commonWorker } = await import("~/v3/commonWorker.server"); - await commonWorker.enqueue({ - job: "v3.reconcileStreamBasinForOrg", - payload: { orgId }, - id: `reconcileStreamBasin:${orgId}`, - }); - } catch (error) { - logger.warn("[setPlan] failed to enqueue stream basin reconcile", { - orgId, - error: error instanceof Error ? error.message : String(error), - }); - } -} - export async function setConcurrencyAddOn(organizationId: string, amount: number) { if (!client) return undefined; diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 97a7f05b59e..e29aeb168fb 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -3,8 +3,9 @@ * `REALTIME_STREAMS_PER_ORG_BASINS_ENABLED`: when off, all orgs share * `REALTIME_STREAMS_S2_BASIN` and this module no-ops. * - * Pure retention-string in / S2-call out. 
No plan or billing - * vocabulary — that lives in `streamBasinRetentionByPlan.server.ts`. + * Pure retention-string in / S2-call out. Plan vocabulary lives in the + * cloud billing app, which calls into the admin sync route to drive + * provisioning + reconfiguration. */ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; @@ -112,6 +113,67 @@ export async function reconfigureBasinForOrg( }); } +type EnsureResult = + | { kind: "skipped"; reason: "feature-disabled" | "org-not-found" } + | { kind: "provisioned"; basin: string; retention: string } + | { kind: "reconfigured"; basin: string; retention: string }; + +// Idempotent: provisions if the org has no basin, PATCHes retention if +// it does. The single entrypoint the cloud billing app drives — both +// for the live plan-change path and the bulk backfill. +export async function ensureBasinForOrg( + orgId: string, + retention: string +): Promise { + if (!isPerOrgBasinsEnabled()) { + return { kind: "skipped", reason: "feature-disabled" }; + } + + const org = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { id: true, streamBasinName: true }, + }); + if (!org) return { kind: "skipped", reason: "org-not-found" }; + + if (!org.streamBasinName) { + const result = await provisionBasinForOrg( + { id: org.id, streamBasinName: null, retention } + ); + if (result.kind === "provisioned") { + return { kind: "provisioned", basin: result.basin, retention: result.retention }; + } + return { kind: "skipped", reason: "feature-disabled" }; + } + + await reconfigureBasinForOrg(org.id, retention); + return { kind: "reconfigured", basin: org.streamBasinName, retention }; +} + +// Inverse of ensureBasinForOrg: nulls the column so future runs/sessions +// land in the shared global basin. The S2 basin lingers; existing streams +// age out on their original retention. 
+export async function deprovisionBasinForOrg( + orgId: string +): Promise<{ kind: "deprovisioned" } | { kind: "skipped"; reason: "no-basin" }> { + const org = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { id: true, streamBasinName: true }, + }); + if (!org?.streamBasinName) return { kind: "skipped", reason: "no-basin" }; + + await prisma.organization.update({ + where: { id: org.id }, + data: { streamBasinName: null }, + }); + + logger.info("[streamBasinProvisioner] deprovisioned basin for org", { + orgId, + previousBasin: org.streamBasinName, + }); + + return { kind: "deprovisioned" }; +} + // S2 REST: POST /v1/basins to create, PATCH /v1/basins/{name} to // reconfigure. Wire shape takes integer seconds; we accept human strings // like "7d" / "1y" as env-var ergonomics and parse them here. diff --git a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts b/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts deleted file mode 100644 index 0d08782a65c..00000000000 --- a/apps/webapp/app/services/realtime/streamBasinRetentionByPlan.server.ts +++ /dev/null @@ -1,135 +0,0 @@ -/** - * Plan → basin policy. The only place that knows which plan codes - * earn a dedicated basin and what retention each gets. Operators - * without a billing API never reach this — orgs stay on the shared - * basin via the read-precedence fallback. - */ -import { prisma } from "~/db.server"; -import { env } from "~/env.server"; -import { logger } from "~/services/logger.server"; -import { getCurrentPlan, isBillingConfigured } from "~/services/platform.v3.server"; -import { - defaultRetention, - isPerOrgBasinsEnabled, - provisionBasinForOrg, - reconfigureBasinForOrg, -} from "./streamBasinProvisioner.server"; - -// Adding a plan: drop its code here AND in `retentionForPlanCode`. -// Exact-match against a known set; substring matching could grant the -// wrong tier (e.g. `"approved"` would match `"pro"`). 
-const PAID_PLAN_CODES = new Set(["v3_hobby_1", "v3_pro_1", "enterprise"]); - -export function isPaidPlanCode(code: string | null | undefined): boolean { - return code != null && PAID_PLAN_CODES.has(code); -} - -export function retentionForPlanCode(code: string | null | undefined): string { - if (!code) return defaultRetention(); - - switch (code) { - case "free": - return env.REALTIME_STREAMS_BASIN_RETENTION_FREE; - case "v3_hobby_1": - return env.REALTIME_STREAMS_BASIN_RETENTION_HOBBY; - case "v3_pro_1": - case "enterprise": - return env.REALTIME_STREAMS_BASIN_RETENTION_PRO; - default: - return defaultRetention(); - } -} - -type ReconcileResult = - | { - kind: "skipped"; - reason: - | "billing-not-configured" - | "feature-disabled" - | "org-not-found" - | "free-no-basin"; - } - | { kind: "provisioned"; retention: string } - | { kind: "reconfigured"; retention: string } - | { kind: "deprovisioned" }; - -// Reconcile an org's basin state with its current plan. Idempotent. -// -// paid + no basin → provision, stamp column. -// paid + has basin → reconfigure retention (in case the tier changed). -// free + has basin → null the column; basin lingers until its streams -// age out on their original retention. -// free + no basin → no-op. -// -// Throws on transient billing failure so redis-worker retries — -// silently defaulting to "free" during an outage would deprovision a -// paid org's basin. -export async function reconcileBasinForOrg(orgId: string): Promise { - if (!isBillingConfigured()) { - return { kind: "skipped", reason: "billing-not-configured" }; - } - - // Provisioner / reconfigure both no-op when the flag is off. Bail - // here so logs and result kinds reflect that, and skip the billing - // round-trip we couldn't act on anyway. 
- if (!isPerOrgBasinsEnabled()) { - return { kind: "skipped", reason: "feature-disabled" }; - } - - const plan = await getCurrentPlan(orgId); - if (plan === undefined) { - throw new Error( - `[streamBasinReconciler] billing plan unavailable for org ${orgId}; will retry` - ); - } - - const planCode = plan.v3Subscription?.plan?.code; - const paid = isPaidPlanCode(planCode); - - const org = await prisma.organization.findFirst({ - where: { id: orgId }, - select: { id: true, streamBasinName: true }, - }); - if (!org) { - return { kind: "skipped", reason: "org-not-found" }; - } - - if (paid && !org.streamBasinName) { - const retention = retentionForPlanCode(planCode); - await provisionBasinForOrg({ id: org.id, streamBasinName: null, retention }); - logger.info("[streamBasinReconciler] provisioned (paid upgrade)", { - orgId, - planCode, - retention, - }); - return { kind: "provisioned", retention }; - } - - if (paid && org.streamBasinName) { - const retention = retentionForPlanCode(planCode); - await reconfigureBasinForOrg(org.id, retention); - logger.info("[streamBasinReconciler] reconfigured (paid tier change)", { - orgId, - planCode, - retention, - }); - return { kind: "reconfigured", retention }; - } - - if (!paid && org.streamBasinName) { - // Downgrade: unstamp the org so future runs/sessions land in the - // shared basin. Don't touch S2 — old streams age out on their own. 
- await prisma.organization.update({ - where: { id: org.id }, - data: { streamBasinName: null }, - }); - logger.info("[streamBasinReconciler] deprovisioned (downgrade to free)", { - orgId, - planCode, - previousBasin: org.streamBasinName, - }); - return { kind: "deprovisioned" }; - } - - return { kind: "skipped", reason: "free-no-basin" }; -} diff --git a/apps/webapp/app/v3/commonWorker.server.ts b/apps/webapp/app/v3/commonWorker.server.ts index f5db4f7023a..a2fae9c73ce 100644 --- a/apps/webapp/app/v3/commonWorker.server.ts +++ b/apps/webapp/app/v3/commonWorker.server.ts @@ -21,7 +21,6 @@ import { ResumeTaskDependencyService } from "./services/resumeTaskDependency.ser import { RetryAttemptService } from "./services/retryAttempt.server"; import { TimeoutDeploymentService } from "./services/timeoutDeployment.server"; import { BulkActionService } from "./services/bulk/BulkActionV2.server"; -import { reconcileBasinForOrg } from "~/services/realtime/streamBasinRetentionByPlan.server"; function initializeWorker() { const redisOptions = { @@ -200,15 +199,6 @@ function initializeWorker() { maxAttempts: 5, }, }, - "v3.reconcileStreamBasinForOrg": { - schema: z.object({ - orgId: z.string(), - }), - visibilityTimeoutMs: 60_000, - retry: { - maxAttempts: 5, - }, - }, }, concurrency: { workers: env.COMMON_WORKER_CONCURRENCY_WORKERS, @@ -292,9 +282,6 @@ function initializeWorker() { const service = new BulkActionService(); await service.process(payload.bulkActionId); }, - "v3.reconcileStreamBasinForOrg": async ({ payload }) => { - await reconcileBasinForOrg(payload.orgId); - }, }, }); From 4ce5998c5bace34ad16d7554e01c162ff8f99372 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 4 May 2026 22:59:27 +0100 Subject: [PATCH 13/13] fix(webapp): match writer basin in session-stream wait race-check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the session row is absent (externalId addressing before the row is upserted), the race-check 
resolved to the legacy basin while the writer side resolves to the org's basin. Fall back to the org so both sides land in the same place — matches the PUT/GET sister routes. --- .../api.v1.runs.$runFriendlyId.session-streams.wait.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index ed1304f4349..4fbdb454d92 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -123,12 +123,13 @@ const { action, loader } = createActionApiRoute( // and remove the pending registration. if (!result.isCached) { try { - // Session streams are hardcoded v2 by the append-side writer - // and SSE subscribe, so the race-check reader matches. Basin - // comes from `session` only — the writer side passes the same - // and we have to read from the same basin to find the record. + // Match the writer's basin resolution exactly: session if the + // row exists, otherwise the org so we look at the same basin a + // fresh row would be stamped with. Mirrors the PUT/GET sister + // routes in `realtime.v1.sessions.$session.$io.ts`. const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2", { session: maybeSession, + organization: maybeSession ? null : authentication.environment.organization, }); if (realtimeStream instanceof S2RealtimeStreams) {