From 445e19f0e94face7e2405f62a3309fda5671a03e Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 30 Jun 2026 18:03:45 +0100 Subject: [PATCH 1/4] fix(webapp,clickhouse): store task output as serialized String in ClickHouse Task run output is stored in a new output_raw String column instead of the native JSON output column. Deeply nested output could push the JSON column's accumulated type past ClickHouse 26.2's input_format_binary_max_type_complexity limit, failing the replication insert so the terminal row never landed and the run appeared stuck. A String has constant binary type complexity regardless of payload shape, so the failure mode is gone for both writes and reads. TRQL path access compiles to JSON_VALUE over output_raw, and bare reads plus full-text search use the String column directly (a new ngram index keeps search fast). The TRQL surface is unchanged. error and the payload column are untouched. --- .server-changes/clickhouse-output-string.md | 6 + .../services/runsReplicationService.server.ts | 24 +++- apps/webapp/app/v3/querySchemas.ts | 8 +- .../test/runsReplicationService.part3.test.ts | 8 +- .../test/runsReplicationService.part6.test.ts | 5 +- .../035_add_task_runs_v2_output_raw.sql | 17 +++ .../clickhouse/src/taskRuns.test.ts | 8 ++ internal-packages/clickhouse/src/taskRuns.ts | 4 + internal-packages/clickhouse/src/tsql.test.ts | 133 ++++++++++++++++++ .../tsql/src/query/printer.test.ts | 97 +++++++++++++ internal-packages/tsql/src/query/printer.ts | 70 +++++++++ internal-packages/tsql/src/query/schema.ts | 22 +++ 12 files changed, 394 insertions(+), 8 deletions(-) create mode 100644 .server-changes/clickhouse-output-string.md create mode 100644 internal-packages/clickhouse/schema/035_add_task_runs_v2_output_raw.sql diff --git a/.server-changes/clickhouse-output-string.md b/.server-changes/clickhouse-output-string.md new file mode 100644 index 00000000000..c42ac8172eb --- /dev/null +++ b/.server-changes/clickhouse-output-string.md @@ -0,0 +1,6 
@@ +--- +area: webapp +type: fix +--- + +Store task run output as serialized JSON text in ClickHouse instead of the native JSON column. Deeply nested output could exceed ClickHouse 26.2's `input_format_binary_max_type_complexity` limit, causing some runs to fail replication and appear stuck. diff --git a/apps/webapp/app/services/runsReplicationService.server.ts b/apps/webapp/app/services/runsReplicationService.server.ts index f76a3c7b83d..9e3beaa422c 100644 --- a/apps/webapp/app/services/runsReplicationService.server.ts +++ b/apps/webapp/app/services/runsReplicationService.server.ts @@ -1061,6 +1061,11 @@ export class RunsReplicationService { _version: bigint ): Promise { const output = await this.#prepareJson(run.output, run.outputType); + // The serialized output is written to the String `output_raw` column rather than the + // native JSON `output` column. A String has constant binary type complexity, so deeply + // nested payloads can no longer breach the type-complexity ceiling and silently drop the + // terminal row on insert. The JSON `output` column is left empty to keep its type trivial. + const outputRaw = serializeJsonRaw(output.data); const errorData = { data: run.error }; // Calculate error fingerprint for failed runs @@ -1100,7 +1105,7 @@ export class RunsReplicationService { run.usageDurationMs ?? 0, // usage_duration_ms run.costInCents ?? 0, // cost_in_cents run.baseCostInCents ?? 0, // base_cost_in_cents - output, // output + { data: undefined }, // output (left empty; serialized value lives in output_raw) errorData, // error errorFingerprint, // error_fingerprint run.runTags ?? [], // tags @@ -1130,6 +1135,7 @@ export class RunsReplicationService { annotations?.rootTriggerSource ?? "", // root_trigger_source annotations?.taskKind ?? "", // task_kind run.isWarmStart ?? 
null, // is_warm_start + outputRaw, // output_raw ]; } @@ -1369,3 +1375,19 @@ function lsnToUInt64(lsn: string): bigint { const [seg, off] = lsn.split("/"); return (BigInt("0x" + seg) << 32n) | BigInt("0x" + off); } + +/** + * Serialize an already-parsed JSON value to the text stored in a `*_raw` String column. + * Returns an empty string when there is no value, which the query layer treats as "no data". + */ +function serializeJsonRaw(data: unknown): string { + if (data === undefined) { + return ""; + } + + try { + return JSON.stringify(data) ?? ""; + } catch { + return ""; + } +} diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 4784ad75629..5fac0032830 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -353,9 +353,11 @@ export const runsSchema: TableSchema = { description: "The data you returned from the task.", example: '{"result": "success"}', }), - nullValue: "'{}'", // Transform NULL checks to compare against empty object - textColumn: "output_text", // Use output_text for full JSON value queries - dataPrefix: "data", // Internal data is wrapped in {"data": ...} + // Stored as serialized JSON text in output_raw (a String) rather than the native JSON + // column, so reads/writes can't hit the JSON binary type-complexity ceiling. 
+ nullValue: "''", // Empty raw string means "no output" + textColumn: "output_raw", // Full-value reads/search use the raw String column + rawColumn: "output_raw", // Path access (output.foo) compiles to JSON_VALUE(output_raw, '$.foo') }, error: { name: "error", diff --git a/apps/webapp/test/runsReplicationService.part3.test.ts b/apps/webapp/test/runsReplicationService.part3.test.ts index 1261be3b513..4e4d60cb3d7 100644 --- a/apps/webapp/test/runsReplicationService.part3.test.ts +++ b/apps/webapp/test/runsReplicationService.part3.test.ts @@ -168,13 +168,17 @@ describe("RunsReplicationService (part 3/7)", () => { status: "COMPLETED_SUCCESSFULLY", }) ); - expect(found?.output).toBeDefined(); + // Output is stored as serialized text in output_raw; the native JSON column stays empty + expect(found?.output).toStrictEqual({}); + expect(found?.output_raw).toBe(`{"foo":"bar"}`); } - // Check the run with the bad JSON + // The run with the bad JSON lands with its output blanked (output_raw empty) rather than + // being dropped, so its terminal status is still recorded. 
const foundBad = result?.find((r: any) => r.span_id === "bulk-10"); expect(foundBad).toBeDefined(); expect(foundBad?.output).toStrictEqual({}); + expect(foundBad?.output_raw).toBe(""); await runsReplicationService.stop(); } diff --git a/apps/webapp/test/runsReplicationService.part6.test.ts b/apps/webapp/test/runsReplicationService.part6.test.ts index 276920f8491..30db9e3a107 100644 --- a/apps/webapp/test/runsReplicationService.part6.test.ts +++ b/apps/webapp/test/runsReplicationService.part6.test.ts @@ -473,8 +473,9 @@ describe("RunsReplicationService (part 6/7)", () => { expect(parseClickhouseTimestamp(clickhouseRun.queued_at)).toBe(queuedAt.getTime()); expect(parseClickhouseTimestamp(clickhouseRun.expired_at)).toBeNull(); - // Output (parsed JSON) - expect(clickhouseRun.output).toEqual({ data: { result: "test-output" } }); + // Output is stored as serialized JSON text in output_raw; the native JSON column is empty + expect(clickhouseRun.output).toEqual({}); + expect(clickhouseRun.output_raw).toEqual(JSON.stringify({ result: "test-output" })); // Error expect(clickhouseRun.error).toEqual({ diff --git a/internal-packages/clickhouse/schema/035_add_task_runs_v2_output_raw.sql b/internal-packages/clickhouse/schema/035_add_task_runs_v2_output_raw.sql new file mode 100644 index 00000000000..b16a04aa530 --- /dev/null +++ b/internal-packages/clickhouse/schema/035_add_task_runs_v2_output_raw.sql @@ -0,0 +1,17 @@ +-- +goose Up +-- Store task output as a serialized JSON String alongside the native JSON `output` column. +-- A String has constant binary type complexity regardless of payload depth/width, so writes +-- and reads can never hit input_format_binary_max_type_complexity the way the JSON type can. +ALTER TABLE trigger_dev.task_runs_v2 + ADD COLUMN IF NOT EXISTS output_raw String DEFAULT ''; + +-- Keep full-text search on output fast now that reads come from output_raw instead of output_text. 
+ALTER TABLE trigger_dev.task_runs_v2 + ADD INDEX IF NOT EXISTS idx_output_raw output_raw TYPE ngrambf_v1 (3, 131072, 3, 0) GRANULARITY 4; + +-- +goose Down +ALTER TABLE trigger_dev.task_runs_v2 + DROP INDEX IF EXISTS idx_output_raw; + +ALTER TABLE trigger_dev.task_runs_v2 + DROP COLUMN IF EXISTS output_raw; diff --git a/internal-packages/clickhouse/src/taskRuns.test.ts b/internal-packages/clickhouse/src/taskRuns.test.ts index 0d4ec995c24..62ea97cd26c 100644 --- a/internal-packages/clickhouse/src/taskRuns.test.ts +++ b/internal-packages/clickhouse/src/taskRuns.test.ts @@ -90,6 +90,7 @@ describe("Task Runs V2", () => { "", // root_trigger_source "", // task_kind null, // is_warm_start + "", // output_raw ]; const [insertError, insertResult] = await insert([taskRunData]); @@ -224,6 +225,7 @@ describe("Task Runs V2", () => { "", // root_trigger_source "", // task_kind null, // is_warm_start + "", // output_raw ]; const run2: TaskRunInsertArray = [ @@ -281,6 +283,7 @@ describe("Task Runs V2", () => { "", // root_trigger_source "", // task_kind null, // is_warm_start + "", // output_raw ]; const [insertError, insertResult] = await insert([run1, run2]); @@ -385,6 +388,7 @@ describe("Task Runs V2", () => { "", // root_trigger_source "", // task_kind null, // is_warm_start + "", // output_raw ]; const [insertError, insertResult] = await insert([taskRun]); @@ -497,6 +501,7 @@ describe("Task Runs V2", () => { "", // root_trigger_source "", // task_kind null, // is_warm_start + "", // output_raw ]; const childA_v1: TaskRunInsertArray = [ @@ -554,6 +559,7 @@ describe("Task Runs V2", () => { "", "", null, + "", // output_raw ]; const childA_v2: TaskRunInsertArray = [...childA_v1]; @@ -615,6 +621,7 @@ describe("Task Runs V2", () => { "", "", null, + "", // output_raw ]; const childDeleted_v1: TaskRunInsertArray = [ @@ -672,6 +679,7 @@ describe("Task Runs V2", () => { "", "", null, + "", // output_raw ]; const childDeleted_v2: TaskRunInsertArray = [...childDeleted_v1]; diff --git 
a/internal-packages/clickhouse/src/taskRuns.ts b/internal-packages/clickhouse/src/taskRuns.ts index 67dd0371f14..9e275a7b24f 100644 --- a/internal-packages/clickhouse/src/taskRuns.ts +++ b/internal-packages/clickhouse/src/taskRuns.ts @@ -28,6 +28,7 @@ export const TaskRunV2 = z.object({ cost_in_cents: z.number().default(0), base_cost_in_cents: z.number().default(0), output: z.unknown(), + output_raw: z.string().default(""), error: z.unknown(), error_fingerprint: z.string().default(""), tags: z.array(z.string()).default([]), @@ -117,6 +118,7 @@ export const TASK_RUN_COLUMNS = [ "root_trigger_source", "task_kind", "is_warm_start", + "output_raw", ] as const; export type TaskRunColumnName = (typeof TASK_RUN_COLUMNS)[number]; @@ -186,6 +188,7 @@ export type TaskRunFieldTypes = { root_trigger_source: string; task_kind: string; is_warm_start: boolean | null; + output_raw: string; }; /** @@ -326,6 +329,7 @@ export type TaskRunInsertArray = [ root_trigger_source: string, task_kind: string, is_warm_start: boolean | null, + output_raw: string, ]; /** diff --git a/internal-packages/clickhouse/src/tsql.test.ts b/internal-packages/clickhouse/src/tsql.test.ts index cb30d5e85c1..c9224e07265 100644 --- a/internal-packages/clickhouse/src/tsql.test.ts +++ b/internal-packages/clickhouse/src/tsql.test.ts @@ -1608,4 +1608,137 @@ describe("Field Mapping Tests", () => { expect(result?.rows).toHaveLength(2); expect(result?.rows?.map((r) => r.run_id).sort()).toEqual(["run_fm_in1", "run_fm_in2"]); }); + + describe("output stored as serialized text (rawColumn bridge)", () => { + // `output` is a JSON column physically stored as serialized JSON text in the String + // column output_raw. Path access bridges to JSON_VALUE; bare access reads the String. 
+ const outputSchema: TableSchema = { + ...taskRunsSchema, + columns: { + ...taskRunsSchema.columns, + output: { + name: "output", + ...column("JSON"), + nullValue: "''", + textColumn: "output_raw", + rawColumn: "output_raw", + }, + }, + }; + + const tenant = { + organization_id: { op: "eq", value: "org_tenant1" }, + project_id: { op: "eq", value: "proj_tenant1" }, + environment_id: { op: "eq", value: "env_tenant1" }, + } as const; + + clickhouseTest( + "filters and selects JSON paths via JSON_VALUE over output_raw", + async ({ clickhouseContainer }) => { + const client = new ClickhouseClient({ + name: "test", + url: clickhouseContainer.getConnectionUrl(), + }); + + const insert = insertTaskRuns(client, { async_insert: 0 }); + const [insertError] = await insert([ + createTaskRun({ run_id: "run_o1", output_raw: JSON.stringify({ foo: "bar", n: 42 }) }), + createTaskRun({ run_id: "run_o2", output_raw: JSON.stringify({ foo: "baz" }) }), + createTaskRun({ run_id: "run_o3", output_raw: "" }), + ]); + expect(insertError).toBeNull(); + + // Path access in SELECT + WHERE both compile to JSON_VALUE(output_raw, '$.foo') + const [error, result] = await executeTSQL(client, { + name: "test-output-path", + query: "SELECT run_id, output.foo AS foo FROM task_runs WHERE output.foo = 'bar'", + schema: z.object({ run_id: z.string(), foo: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + + expect(error).toBeNull(); + expect(result?.rows).toEqual([{ run_id: "run_o1", foo: "bar" }]); + + // A numeric scalar path works too, returning the value as a string + const [nError, nResult] = await executeTSQL(client, { + name: "test-output-path-number", + query: "SELECT output.n AS n FROM task_runs WHERE run_id = 'run_o1'", + schema: z.object({ n: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(nError).toBeNull(); + expect(nResult?.rows).toEqual([{ n: "42" }]); + } + ); + + clickhouseTest("reads the bare value from 
output_raw", async ({ clickhouseContainer }) => { + const client = new ClickhouseClient({ + name: "test", + url: clickhouseContainer.getConnectionUrl(), + }); + + const insert = insertTaskRuns(client, { async_insert: 0 }); + const [insertError] = await insert([ + createTaskRun({ run_id: "run_b1", output_raw: JSON.stringify({ foo: "baz" }) }), + createTaskRun({ run_id: "run_b2", output_raw: "" }), + ]); + expect(insertError).toBeNull(); + + // Bare selection reads the raw String column directly + const [error, result] = await executeTSQL(client, { + name: "test-output-bare", + query: "SELECT run_id, output FROM task_runs WHERE run_id = 'run_b1'", + schema: z.object({ run_id: z.string(), output: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(error).toBeNull(); + expect(result?.rows).toEqual([{ run_id: "run_b1", output: JSON.stringify({ foo: "baz" }) }]); + + // An empty output_raw is treated as "no output" by IS NULL + const [nullError, nullResult] = await executeTSQL(client, { + name: "test-output-is-null", + query: "SELECT run_id FROM task_runs WHERE output IS NULL", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(nullError).toBeNull(); + expect(nullResult?.rows).toEqual([{ run_id: "run_b2" }]); + }); + + clickhouseTest( + "full-text search matches against output_raw", + async ({ clickhouseContainer }) => { + const client = new ClickhouseClient({ + name: "test", + url: clickhouseContainer.getConnectionUrl(), + }); + + const insert = insertTaskRuns(client, { async_insert: 0 }); + const [insertError] = await insert([ + createTaskRun({ + run_id: "run_s1", + output_raw: JSON.stringify({ message: "boom happened here" }), + }), + createTaskRun({ run_id: "run_s2", output_raw: JSON.stringify({ message: "all good" }) }), + createTaskRun({ run_id: "run_s3", output_raw: "" }), + ]); + expect(insertError).toBeNull(); + + // Bare LIKE compiles to 
like(output_raw, ...) and matches the serialized text + const [error, result] = await executeTSQL(client, { + name: "test-output-like", + query: "SELECT run_id FROM task_runs WHERE output LIKE '%boom%'", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(error).toBeNull(); + expect(result?.rows).toEqual([{ run_id: "run_s1" }]); + } + ); + }); }); diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index 0aa5b816064..292f446f769 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -941,6 +941,103 @@ describe("ClickHousePrinter", () => { }); }); + describe("rawColumn bridge for String-backed JSON columns", () => { + // A JSON column stored as serialized text in a String column (output_raw). Path access + // compiles to JSON_VALUE over the String column; bare access/search use it as a text column. 
+ const rawColumnSchema: TableSchema = { + name: "runs", + clickhouseName: "trigger_dev.task_runs_v2", + columns: { + id: { name: "id", ...column("String") }, + output: { + name: "output", + ...column("JSON"), + nullValue: "''", + textColumn: "output_raw", + rawColumn: "output_raw", + }, + status: { name: "status", ...column("String") }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + function createRawColumnContext() { + const schema = createSchemaRegistry([rawColumnSchema]); + return createPrinterContext({ + schema, + enforcedWhereClause: { + organization_id: { op: "eq", value: "org_test" }, + project_id: { op: "eq", value: "proj_test" }, + environment_id: { op: "eq", value: "env_test" }, + }, + }); + } + + it("uses the raw String column for bare selection", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT output FROM runs", ctx); + + expect(sql).toContain("output_raw AS output"); + expect(sql).not.toContain("JSON_VALUE"); + }); + + it("compiles single-level path access to JSON_VALUE", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT output.name FROM runs", ctx); + + expect(sql).toContain("JSON_VALUE(output_raw, '$.name') AS output_name"); + expect(sql).not.toContain(".:String"); + }); + + it("compiles nested path access to JSON_VALUE", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT output.data.name FROM runs", ctx); + + expect(sql).toContain("JSON_VALUE(output_raw, '$.data.name') AS output_data_name"); + }); + + it("compiles path access in WHERE to JSON_VALUE", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.name 
= 'test'", ctx); + + expect(sql).toContain("equals(JSON_VALUE(output_raw, '$.name'),"); + }); + + it("uses the raw String column for bare LIKE search", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output LIKE '%boom%'", ctx); + + expect(sql).toContain("like(output_raw,"); + expect(sql).not.toContain("JSON_VALUE"); + }); + + it("uses the empty-string nullValue for IS NULL on the raw String column", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output IS NULL", ctx); + + expect(sql).toContain("equals(output_raw, '')"); + }); + + it("produces matching JSON_VALUE in SELECT and GROUP BY", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery( + "SELECT output.status, count() AS c FROM runs GROUP BY output.status", + ctx + ); + + expect(sql).toContain("JSON_VALUE(output_raw, '$.status') AS output_status"); + expect(sql).toContain("GROUP BY JSON_VALUE(output_raw, '$.status')"); + expect(sql).not.toContain(".:String"); + }); + }); + describe("dataPrefix for JSON columns", () => { // Create a schema with JSON columns that have dataPrefix set const dataPrefixSchema: TableSchema = { diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index a4b50ca8540..e6049191d9b 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -2366,6 +2366,13 @@ export class ClickHousePrinter { return `(${virtualExpression})`; } + // JSON-path access on a String-backed JSON column compiles to a JSON_VALUE bridge over the + // raw String column instead of native JSON sub-column access. 
+ const rawColumnAccess = this.getRawColumnAccessForField(node.chain); + if (rawColumnAccess !== null) { + return rawColumnAccess; + } + // Inject dataPrefix for JSON columns if needed (e.g., output.message -> output.data.message) const chainWithPrefix = this.injectDataPrefix(node.chain); @@ -2607,6 +2614,69 @@ export class ClickHousePrinter { return parts.join("_"); } + /** + * If a field chain is JSON-path access on a column backed by a `rawColumn` (a String holding + * serialized JSON), build the `JSON_VALUE` bridge expression. Returns null otherwise. + * + * e.g. `output.foo` -> `JSON_VALUE(output_raw, '$.foo')` + * `r.output.a.b` -> `JSON_VALUE(r.output_raw, '$.a.b')` + * + * The path is inlined as an escaped string literal (rather than a query parameter) so that + * the same access produces byte-identical SQL in SELECT and GROUP BY, which ClickHouse + * requires for the expressions to match. + */ + private getRawColumnAccessForField(chain: Array<string | number>): string | null { + if (chain.length < 2) return null; + + const firstPart = chain[0]; + if (typeof firstPart !== "string") return null; + + let columnSchema: ColumnSchema | null; + let rawColumnExpr: string; + let pathParts: Array<string | number>; + + const tableSchema = this.tableContexts.get(firstPart); + if (tableSchema) { + // Qualified: table.column.subfield... (needs at least 3 parts to be path access) + if (chain.length < 3) return null; + const columnName = chain[1]; + if (typeof columnName !== "string") return null; + columnSchema = tableSchema.columns[columnName] ?? null; + if (!columnSchema?.rawColumn) return null; + rawColumnExpr = `${this.printIdentifier(firstPart)}.${this.printIdentifier( + columnSchema.rawColumn + )}`; + pathParts = chain.slice(2); + } else { + // Unqualified: column.subfield... 
+ columnSchema = this.resolveFieldToColumnSchema([firstPart]); + if (!columnSchema?.rawColumn) return null; + rawColumnExpr = this.printIdentifier(columnSchema.rawColumn); + pathParts = chain.slice(1); + } + + if (pathParts.length === 0) return null; + + const jsonPath = this.buildJsonPath(pathParts); + return `JSON_VALUE(${rawColumnExpr}, ${escapeClickHouseString(jsonPath)})`; + } + + /** + * Build a ClickHouse JSON path (e.g. `$.a.b[0]`) from a field chain's path parts. + * String parts become `.key`; numeric parts become `[index]`. + */ + private buildJsonPath(parts: Array<string | number>): string { + let path = "$"; + for (const part of parts) { + if (typeof part === "number") { + path += `[${part}]`; + } else { + path += `.${part}`; + } + } + return path; + } + /** * Resolve a field chain to its column schema (if it references a known column) */ diff --git a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts index 9a1e2d2ddfe..ab609614170 100644 --- a/internal-packages/tsql/src/query/schema.ts +++ b/internal-packages/tsql/src/query/schema.ts @@ -254,6 +254,28 @@ export interface ColumnSchema { * ``` */ dataPrefix?: string; + /** + * String column holding the serialized JSON text for this column. + * + * When set, JSON-path access (e.g. `output.foo`) compiles to a `JSON_VALUE` call over this + * String column instead of native JSON sub-column access. This lets a column keep its JSON + * query surface while being physically stored as a `String`, which sidesteps the native JSON + * type's binary type-complexity ceiling. The path surface (`output.foo`) is unchanged for + * callers — only the generated SQL differs. + * + * `JSON_VALUE` returns scalar leaves as strings (and an empty string for missing keys or + * object/array subtrees), matching how scalar path access is used in filters and display. 
+ * + * @example + * ```typescript + * { + * name: "output", + * type: "JSON", + * rawColumn: "output_raw", // output.foo → JSON_VALUE(output_raw, '$.foo') + * } + * ``` + */ + rawColumn?: string; } /** From f45b3e33894c0a552fc3acd96fd9bdc3e7151b47 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 30 Jun 2026 18:40:56 +0100 Subject: [PATCH 2/4] feat(tsql): support object and array subtrees in output_raw path access The rawColumn bridge now compiles JSON path access to a JSONExtract expression that returns string scalars unquoted (so equality, LIKE and display match native scalar access) while returning object and array subtrees as raw JSON text. Missing keys yield an empty string. --- apps/webapp/app/v3/querySchemas.ts | 2 +- internal-packages/clickhouse/src/tsql.test.ts | 24 ++++++++-- .../tsql/src/query/printer.test.ts | 33 +++++++++----- internal-packages/tsql/src/query/printer.ts | 44 ++++++++++--------- internal-packages/tsql/src/query/schema.ts | 17 +++---- 5 files changed, 75 insertions(+), 45 deletions(-) diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 5fac0032830..76f549099ae 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -357,7 +357,7 @@ export const runsSchema: TableSchema = { // column, so reads/writes can't hit the JSON binary type-complexity ceiling. 
nullValue: "''", // Empty raw string means "no output" textColumn: "output_raw", // Full-value reads/search use the raw String column - rawColumn: "output_raw", // Path access (output.foo) compiles to JSON_VALUE(output_raw, '$.foo') + rawColumn: "output_raw", // Path access (output.foo) compiles to a JSONExtract bridge over output_raw }, error: { name: "error", diff --git a/internal-packages/clickhouse/src/tsql.test.ts b/internal-packages/clickhouse/src/tsql.test.ts index c9224e07265..a13b6134a6c 100644 --- a/internal-packages/clickhouse/src/tsql.test.ts +++ b/internal-packages/clickhouse/src/tsql.test.ts @@ -1611,7 +1611,8 @@ describe("Field Mapping Tests", () => { describe("output stored as serialized text (rawColumn bridge)", () => { // `output` is a JSON column physically stored as serialized JSON text in the String - // column output_raw. Path access bridges to JSON_VALUE; bare access reads the String. + // column output_raw. Path access bridges to a JSONExtract expression (unquoted scalars, + // raw JSON for object/array subtrees); bare access reads the String directly. 
const outputSchema: TableSchema = { ...taskRunsSchema, columns: { @@ -1633,7 +1634,7 @@ describe("Field Mapping Tests", () => { } as const; clickhouseTest( - "filters and selects JSON paths via JSON_VALUE over output_raw", + "filters and selects JSON paths over output_raw", async ({ clickhouseContainer }) => { const client = new ClickhouseClient({ name: "test", @@ -1642,13 +1643,16 @@ describe("Field Mapping Tests", () => { const insert = insertTaskRuns(client, { async_insert: 0 }); const [insertError] = await insert([ - createTaskRun({ run_id: "run_o1", output_raw: JSON.stringify({ foo: "bar", n: 42 }) }), + createTaskRun({ + run_id: "run_o1", + output_raw: JSON.stringify({ foo: "bar", n: 42, nested: { id: 7 }, list: [1, 2] }), + }), createTaskRun({ run_id: "run_o2", output_raw: JSON.stringify({ foo: "baz" }) }), createTaskRun({ run_id: "run_o3", output_raw: "" }), ]); expect(insertError).toBeNull(); - // Path access in SELECT + WHERE both compile to JSON_VALUE(output_raw, '$.foo') + // Scalar path access in SELECT + WHERE returns the value unquoted const [error, result] = await executeTSQL(client, { name: "test-output-path", query: "SELECT run_id, output.foo AS foo FROM task_runs WHERE output.foo = 'bar'", @@ -1670,6 +1674,18 @@ describe("Field Mapping Tests", () => { }); expect(nError).toBeNull(); expect(nResult?.rows).toEqual([{ n: "42" }]); + + // Object and array subtrees come back as raw JSON text, not an empty string + const [subError, subResult] = await executeTSQL(client, { + name: "test-output-path-subtree", + query: + "SELECT output.nested AS nested, output.list AS list FROM task_runs WHERE run_id = 'run_o1'", + schema: z.object({ nested: z.string(), list: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(subError).toBeNull(); + expect(subResult?.rows).toEqual([{ nested: JSON.stringify({ id: 7 }), list: "[1,2]" }]); } ); diff --git a/internal-packages/tsql/src/query/printer.test.ts 
b/internal-packages/tsql/src/query/printer.test.ts index 292f446f769..cb152456d4b 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -980,34 +980,41 @@ describe("ClickHousePrinter", () => { }); } + // The bridge expression for a single key: unquoted scalar strings, raw JSON for subtrees, + // '' for missing keys. + const nameBridge = + "if(JSONType(output_raw, 'name') = 'String', " + + "JSONExtractString(output_raw, 'name'), JSONExtractRaw(output_raw, 'name'))"; + it("uses the raw String column for bare selection", () => { const ctx = createRawColumnContext(); const { sql } = printQuery("SELECT output FROM runs", ctx); expect(sql).toContain("output_raw AS output"); - expect(sql).not.toContain("JSON_VALUE"); + expect(sql).not.toContain("JSONExtract"); }); - it("compiles single-level path access to JSON_VALUE", () => { + it("compiles single-level path access to a JSONExtract bridge", () => { const ctx = createRawColumnContext(); const { sql } = printQuery("SELECT output.name FROM runs", ctx); - expect(sql).toContain("JSON_VALUE(output_raw, '$.name') AS output_name"); + expect(sql).toContain(`${nameBridge} AS output_name`); expect(sql).not.toContain(".:String"); }); - it("compiles nested path access to JSON_VALUE", () => { + it("compiles nested path access to a JSONExtract bridge with multiple keys", () => { const ctx = createRawColumnContext(); const { sql } = printQuery("SELECT output.data.name FROM runs", ctx); - expect(sql).toContain("JSON_VALUE(output_raw, '$.data.name') AS output_data_name"); + expect(sql).toContain("JSONExtractString(output_raw, 'data', 'name')"); + expect(sql).toContain("AS output_data_name"); }); - it("compiles path access in WHERE to JSON_VALUE", () => { + it("compiles path access in WHERE to a JSONExtract bridge", () => { const ctx = createRawColumnContext(); const { sql } = printQuery("SELECT id FROM runs WHERE output.name = 'test'", ctx); - 
expect(sql).toContain("equals(JSON_VALUE(output_raw, '$.name'),"); + expect(sql).toContain(`equals(${nameBridge},`); }); it("uses the raw String column for bare LIKE search", () => { @@ -1015,7 +1022,7 @@ describe("ClickHousePrinter", () => { const { sql } = printQuery("SELECT id FROM runs WHERE output LIKE '%boom%'", ctx); expect(sql).toContain("like(output_raw,"); - expect(sql).not.toContain("JSON_VALUE"); + expect(sql).not.toContain("JSONExtract"); }); it("uses the empty-string nullValue for IS NULL on the raw String column", () => { @@ -1025,15 +1032,19 @@ describe("ClickHousePrinter", () => { expect(sql).toContain("equals(output_raw, '')"); }); - it("produces matching JSON_VALUE in SELECT and GROUP BY", () => { + it("produces a matching bridge expression in SELECT and GROUP BY", () => { const ctx = createRawColumnContext(); const { sql } = printQuery( "SELECT output.status, count() AS c FROM runs GROUP BY output.status", ctx ); - expect(sql).toContain("JSON_VALUE(output_raw, '$.status') AS output_status"); - expect(sql).toContain("GROUP BY JSON_VALUE(output_raw, '$.status')"); + const statusBridge = + "if(JSONType(output_raw, 'status') = 'String', " + + "JSONExtractString(output_raw, 'status'), JSONExtractRaw(output_raw, 'status'))"; + + expect(sql).toContain(`${statusBridge} AS output_status`); + expect(sql).toContain(`GROUP BY ${statusBridge}`); expect(sql).not.toContain(".:String"); }); }); diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index e6049191d9b..343ac3c291f 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -2366,7 +2366,7 @@ export class ClickHousePrinter { return `(${virtualExpression})`; } - // JSON-path access on a String-backed JSON column compiles to a JSON_VALUE bridge over the + // JSON-path access on a String-backed JSON column compiles to a JSONExtract bridge over the // raw String column instead of native JSON sub-column 
access. const rawColumnAccess = this.getRawColumnAccessForField(node.chain); if (rawColumnAccess !== null) { @@ -2616,14 +2616,16 @@ export class ClickHousePrinter { /** * If a field chain is JSON-path access on a column backed by a `rawColumn` (a String holding - * serialized JSON), build the `JSON_VALUE` bridge expression. Returns null otherwise. + * serialized JSON), build the JSONExtract bridge expression. Returns null otherwise. * - * e.g. `output.foo` -> `JSON_VALUE(output_raw, '$.foo')` - * `r.output.a.b` -> `JSON_VALUE(r.output_raw, '$.a.b')` + * e.g. `output.foo` -> JSONExtract over output_raw with key 'foo' + * `r.output.a.b` -> JSONExtract over r.output_raw with keys 'a', 'b' * - * The path is inlined as an escaped string literal (rather than a query parameter) so that - * the same access produces byte-identical SQL in SELECT and GROUP BY, which ClickHouse - * requires for the expressions to match. + * The bridge returns string scalars unquoted (so `=`, LIKE and display behave like native + * scalar access) while still returning object/array subtrees as raw JSON text. Missing keys + * yield an empty string. Keys are inlined as escaped literals (not query parameters) so the + * same access produces byte-identical SQL in SELECT and GROUP BY, which ClickHouse requires + * for the expressions to match. */ private getRawColumnAccessForField(chain: Array): string | null { if (chain.length < 2) return null; @@ -2657,24 +2659,24 @@ export class ClickHousePrinter { if (pathParts.length === 0) return null; - const jsonPath = this.buildJsonPath(pathParts); - return `JSON_VALUE(${rawColumnExpr}, ${escapeClickHouseString(jsonPath)})`; + const keyArgs = this.buildJsonExtractKeyArgs(pathParts); + const type = `JSONType(${rawColumnExpr}, ${keyArgs})`; + const string = `JSONExtractString(${rawColumnExpr}, ${keyArgs})`; + const raw = `JSONExtractRaw(${rawColumnExpr}, ${keyArgs})`; + // String leaf -> unquoted value (keeps =/LIKE/display faithful). 
Everything else (number, + // bool, object, array, null) -> raw JSON text, which preserves subtrees. A missing key is + // not a String either, so it falls to JSONExtractRaw and yields an empty string. + return `if(${type} = 'String', ${string}, ${raw})`; } /** - * Build a ClickHouse JSON path (e.g. `$.a.b[0]`) from a field chain's path parts. - * String parts become `.key`; numeric parts become `[index]`. + * Build the comma-separated key arguments for a JSONExtract* call from a field chain's path + * parts. String parts become escaped string-literal keys; numeric parts become array indices. */ - private buildJsonPath(parts: Array): string { - let path = "$"; - for (const part of parts) { - if (typeof part === "number") { - path += `[${part}]`; - } else { - path += `.${part}`; - } - } - return path; + private buildJsonExtractKeyArgs(parts: Array): string { + return parts + .map((part) => (typeof part === "number" ? String(part) : escapeClickHouseString(part))) + .join(", "); } /** diff --git a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts index ab609614170..136497decb2 100644 --- a/internal-packages/tsql/src/query/schema.ts +++ b/internal-packages/tsql/src/query/schema.ts @@ -257,21 +257,22 @@ export interface ColumnSchema { /** * String column holding the serialized JSON text for this column. * - * When set, JSON-path access (e.g. `output.foo`) compiles to a `JSON_VALUE` call over this - * String column instead of native JSON sub-column access. This lets a column keep its JSON - * query surface while being physically stored as a `String`, which sidesteps the native JSON - * type's binary type-complexity ceiling. The path surface (`output.foo`) is unchanged for - * callers — only the generated SQL differs. + * When set, JSON-path access (e.g. `output.foo`) compiles to a `JSONExtract` expression over + * this String column instead of native JSON sub-column access. 
This lets a column keep its + * JSON query surface while being physically stored as a `String`, which sidesteps the native + * JSON type's binary type-complexity ceiling. The path surface (`output.foo`) is unchanged + * for callers — only the generated SQL differs. * - * `JSON_VALUE` returns scalar leaves as strings (and an empty string for missing keys or - * object/array subtrees), matching how scalar path access is used in filters and display. + * Scalar string leaves are returned unquoted (so `=`, LIKE and display behave like native + * scalar access), object/array subtrees are returned as raw JSON text, and missing keys + * yield an empty string. * * @example * ```typescript * { * name: "output", * type: "JSON", - * rawColumn: "output_raw", // output.foo → JSON_VALUE(output_raw, '$.foo') + * rawColumn: "output_raw", // output.foo → JSONExtract bridge over output_raw * } * ``` */ From ff789cfdde026a37bdd88d7b814449fbc3ff71e3 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 30 Jun 2026 18:54:22 +0100 Subject: [PATCH 3/4] feat(tsql): numeric and boolean comparisons on output_raw paths When a JSON path on a String-backed column is compared against a numeric or boolean literal, extract the path as that type (JSONExtractFloat / JSONExtractBool) so the comparison is numeric/boolean with correct equality and ordering, rather than a string comparison that would either error on type mismatch or sort lexically. String literals and the LIKE family keep the string bridge. 
--- internal-packages/clickhouse/src/tsql.test.ts | 80 +++++++++++++++ .../tsql/src/query/printer.test.ts | 37 +++++++ internal-packages/tsql/src/query/printer.ts | 98 +++++++++++++++++-- 3 files changed, 208 insertions(+), 7 deletions(-) diff --git a/internal-packages/clickhouse/src/tsql.test.ts b/internal-packages/clickhouse/src/tsql.test.ts index a13b6134a6c..a7d3141eaca 100644 --- a/internal-packages/clickhouse/src/tsql.test.ts +++ b/internal-packages/clickhouse/src/tsql.test.ts @@ -1689,6 +1689,86 @@ describe("Field Mapping Tests", () => { } ); + clickhouseTest( + "exercises path access across scalar types in SELECT and WHERE", + async ({ clickhouseContainer }) => { + const client = new ClickhouseClient({ + name: "test", + url: clickhouseContainer.getConnectionUrl(), + }); + + const insert = insertTaskRuns(client, { async_insert: 0 }); + const [insertError] = await insert([ + createTaskRun({ + run_id: "run_t1", + output_raw: JSON.stringify({ str: "hello", num: 42, flag: true }), + }), + createTaskRun({ + run_id: "run_t2", + output_raw: JSON.stringify({ str: "world", num: 9, flag: false }), + }), + ]); + expect(insertError).toBeNull(); + + // SELECT a string, a number and a boolean path from the same row (all returned as text) + const [selError, selResult] = await executeTSQL(client, { + name: "path-types-select", + query: + "SELECT output.str AS s, output.num AS n, output.flag AS f FROM task_runs WHERE run_id = 'run_t1'", + schema: z.object({ s: z.string(), n: z.string(), f: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(selError).toBeNull(); + expect(selResult?.rows).toEqual([{ s: "hello", n: "42", f: "true" }]); + + // WHERE on a string path + const [strError, strResult] = await executeTSQL(client, { + name: "path-where-string", + query: "SELECT run_id FROM task_runs WHERE output.str = 'hello'", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + 
expect(strError).toBeNull(); + expect(strResult?.rows).toEqual([{ run_id: "run_t1" }]); + + // WHERE equality on a numeric path with a numeric literal (numeric comparison) + const [numEqError, numEqResult] = await executeTSQL(client, { + name: "path-where-num-eq", + query: "SELECT run_id FROM task_runs WHERE output.num = 42", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(numEqError).toBeNull(); + expect(numEqResult?.rows).toEqual([{ run_id: "run_t1" }]); + + // WHERE ordering on a numeric path: comparison is numeric, not lexical, so 9 must NOT + // match `> 40` even though the text '9' sorts after '40'. + const [numGtError, numGtResult] = await executeTSQL(client, { + name: "path-where-num-gt", + query: "SELECT run_id FROM task_runs WHERE output.num > 40 ORDER BY run_id", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(numGtError).toBeNull(); + expect(numGtResult?.rows).toEqual([{ run_id: "run_t1" }]); + + // WHERE on a boolean path with a boolean literal + const [boolError, boolResult] = await executeTSQL(client, { + name: "path-where-bool", + query: "SELECT run_id FROM task_runs WHERE output.flag = false", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(boolError).toBeNull(); + expect(boolResult?.rows).toEqual([{ run_id: "run_t2" }]); + } + ); + clickhouseTest("reads the bare value from output_raw", async ({ clickhouseContainer }) => { const client = new ClickhouseClient({ name: "test", diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index cb152456d4b..df0baaa7bda 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -1047,6 +1047,43 @@ describe("ClickHousePrinter", () => { expect(sql).toContain(`GROUP BY 
${statusBridge}`); expect(sql).not.toContain(".:String"); }); + + it("compiles a numeric-literal comparison to a typed Float extractor", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.num = 42", ctx); + + expect(sql).toContain("equals(JSONExtractFloat(output_raw, 'num'), 42)"); + }); + + it("compiles numeric-literal ordering to a typed Float extractor (numeric, not lexical)", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.num > 40", ctx); + + expect(sql).toContain("greater(JSONExtractFloat(output_raw, 'num'), 40)"); + }); + + it("compiles a boolean-literal comparison to a typed Bool extractor", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.flag = true", ctx); + + expect(sql).toContain("equals(JSONExtractBool(output_raw, 'flag'), 1)"); + }); + + it("keeps the string bridge for string-literal comparisons", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.name = 'test'", ctx); + + expect(sql).toContain(`equals(${nameBridge},`); + expect(sql).not.toContain("JSONExtractFloat"); + }); + + it("keeps the string bridge for LIKE even against a numeric-looking path", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.num LIKE '%4%'", ctx); + + expect(sql).toContain("like(if(JSONType(output_raw, 'num') = 'String'"); + expect(sql).not.toContain("JSONExtractFloat"); + }); }); describe("dataPrefix for JSON columns", () => { diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 343ac3c291f..3dc80a35c36 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -1984,9 +1984,37 @@ export class ClickHousePrinter { const useTextColumn = textColumnOps.includes(node.op); 
const leftTextColumn = useTextColumn ? this.getTextColumnForExpression(node.left) : null; + // Type-directed extraction: when a rawColumn JSON path is compared against a numeric or + // boolean literal, extract the path as that type so the comparison is numeric/boolean (correct + // equality and ordering) rather than a string comparison. LIKE-family operators are excluded + // (string matching only). String literals keep the default string bridge. + const likeOps = [ + CompareOperationOp.Like, + CompareOperationOp.ILike, + CompareOperationOp.NotLike, + CompareOperationOp.NotILike, + ]; + let typedLeft: string | null = null; + let typedRight: string | null = null; + if (!likeOps.includes(node.op)) { + const leftChain = this.rawColumnPathChain(node.left); + const rightKind = this.rawColumnComparisonKind(transformedRight); + if (leftChain && rightKind) { + typedLeft = this.getRawColumnAccessForField(leftChain, rightKind); + } else { + const rightChain = this.rawColumnPathChain(transformedRight); + const leftKind = this.rawColumnComparisonKind(node.left); + if (rightChain && leftKind) { + typedRight = this.getRawColumnAccessForField(rightChain, leftKind); + } + } + } + // Build the left side, qualifying the text column with table alias if present let left: string; - if (leftTextColumn) { + if (typedLeft) { + left = typedLeft; + } else if (leftTextColumn) { // Check if the field is qualified with a table alias (e.g., r.output) // and prepend that alias to the text column to avoid ambiguity in JOINs const fieldNode = node.left as Field; @@ -2004,7 +2032,7 @@ export class ClickHousePrinter { } else { left = this.visit(node.left); } - const right = this.visit(transformedRight); + const right = typedRight ?? this.visit(transformedRight); switch (node.op) { case CompareOperationOp.Eq: @@ -2627,7 +2655,22 @@ export class ClickHousePrinter { * same access produces byte-identical SQL in SELECT and GROUP BY, which ClickHouse requires * for the expressions to match. 
*/ - private getRawColumnAccessForField(chain: Array): string | null { + private getRawColumnAccessForField( + chain: Array, + kind: "string" | "float" | "bool" = "string" + ): string | null { + const resolved = this.resolveRawColumnPath(chain); + if (resolved === null) return null; + return this.buildRawColumnExtract(resolved.rawColumnExpr, resolved.keyArgs, kind); + } + + /** + * Resolve a field chain to the underlying raw String column and JSONExtract key arguments, + * if (and only if) it is JSON-path access on a column with a `rawColumn`. Returns null otherwise. + */ + private resolveRawColumnPath( + chain: Array + ): { rawColumnExpr: string; keyArgs: string } | null { if (chain.length < 2) return null; const firstPart = chain[0]; @@ -2659,13 +2702,31 @@ export class ClickHousePrinter { if (pathParts.length === 0) return null; - const keyArgs = this.buildJsonExtractKeyArgs(pathParts); + return { rawColumnExpr, keyArgs: this.buildJsonExtractKeyArgs(pathParts) }; + } + + /** + * Build the JSONExtract bridge expression over a raw String column. + * + * - `float`/`bool`: extract a typed scalar so comparisons against numeric/boolean literals are + * numeric (correct equality and ordering), not string comparisons. + * - `string` (default): return string scalars unquoted (keeps `=`, LIKE and display faithful) + * and object/array subtrees as raw JSON text; a missing key yields an empty string. + */ + private buildRawColumnExtract( + rawColumnExpr: string, + keyArgs: string, + kind: "string" | "float" | "bool" + ): string { + if (kind === "float") { + return `JSONExtractFloat(${rawColumnExpr}, ${keyArgs})`; + } + if (kind === "bool") { + return `JSONExtractBool(${rawColumnExpr}, ${keyArgs})`; + } const type = `JSONType(${rawColumnExpr}, ${keyArgs})`; const string = `JSONExtractString(${rawColumnExpr}, ${keyArgs})`; const raw = `JSONExtractRaw(${rawColumnExpr}, ${keyArgs})`; - // String leaf -> unquoted value (keeps =/LIKE/display faithful). 
Everything else (number, - // bool, object, array, null) -> raw JSON text, which preserves subtrees. A missing key is - // not a String either, so it falls to JSONExtractRaw and yields an empty string. return `if(${type} = 'String', ${string}, ${raw})`; } @@ -2679,6 +2740,29 @@ export class ClickHousePrinter { .join(", "); } + /** + * If an expression is JSON-path access on a `rawColumn` column, return its field chain so the + * caller can build a typed extractor; otherwise null. + */ + private rawColumnPathChain(node: Expression): Array | null { + if ((node as Field).expression_type !== "field") return null; + const chain = (node as Field).chain; + return this.resolveRawColumnPath(chain) ? chain : null; + } + + /** + * The extractor kind to use when comparing a rawColumn JSON path against this expression, based + * on the expression being a numeric or boolean literal. Returns null for strings / non-constants + * (which use the default string bridge). + */ + private rawColumnComparisonKind(node: Expression): "float" | "bool" | null { + if ((node as Constant).expression_type !== "constant") return null; + const value = (node as Constant).value; + if (typeof value === "number") return "float"; + if (typeof value === "boolean") return "bool"; + return null; + } + /** * Resolve a field chain to its column schema (if it references a known column) */ From 4886c7e0991b094be0c23db5b2e149f92c2b8265 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 30 Jun 2026 22:55:48 +0100 Subject: [PATCH 4/4] feat(tsql): apply typed path extraction to BETWEEN on output_raw BETWEEN on a String-backed JSON path with numeric bounds now extracts the path via JSONExtractFloat (matching the comparison operators), so range checks are numeric rather than lexical string comparisons. 
--- internal-packages/clickhouse/src/tsql.test.ts | 11 +++++++++++ internal-packages/tsql/src/query/printer.test.ts | 7 +++++++ internal-packages/tsql/src/query/printer.ts | 8 +++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/internal-packages/clickhouse/src/tsql.test.ts b/internal-packages/clickhouse/src/tsql.test.ts index a7d3141eaca..2fd9fbf1955 100644 --- a/internal-packages/clickhouse/src/tsql.test.ts +++ b/internal-packages/clickhouse/src/tsql.test.ts @@ -1756,6 +1756,17 @@ describe("Field Mapping Tests", () => { expect(numGtError).toBeNull(); expect(numGtResult?.rows).toEqual([{ run_id: "run_t1" }]); + // WHERE numeric range on a path: BETWEEN is also numeric, so only 9 falls in [5, 20] + const [betweenError, betweenResult] = await executeTSQL(client, { + name: "path-where-num-between", + query: "SELECT run_id FROM task_runs WHERE output.num BETWEEN 5 AND 20", + schema: z.object({ run_id: z.string() }), + enforcedWhereClause: tenant, + tableSchema: [outputSchema], + }); + expect(betweenError).toBeNull(); + expect(betweenResult?.rows).toEqual([{ run_id: "run_t2" }]); + // WHERE on a boolean path with a boolean literal const [boolError, boolResult] = await executeTSQL(client, { name: "path-where-bool", diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index df0baaa7bda..d4804fbe2b2 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -1084,6 +1084,13 @@ describe("ClickHousePrinter", () => { expect(sql).toContain("like(if(JSONType(output_raw, 'num') = 'String'"); expect(sql).not.toContain("JSONExtractFloat"); }); + + it("compiles a numeric BETWEEN on a path to a typed Float extractor", () => { + const ctx = createRawColumnContext(); + const { sql } = printQuery("SELECT id FROM runs WHERE output.num BETWEEN 10 AND 20", ctx); + + expect(sql).toContain("JSONExtractFloat(output_raw, 'num') BETWEEN 10 AND 20"); + }); 
}); describe("dataPrefix for JSON columns", () => { diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 3dc80a35c36..2073ee6d7c5 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -2269,7 +2269,13 @@ export class ClickHousePrinter { } private visitBetweenExpr(node: BetweenExpr): string { - const expr = this.visit(node.expr); + // Mirror visitCompareOperation's type-directed extraction: a BETWEEN on a rawColumn JSON path + // with numeric (or boolean) bounds extracts the path as that type so the range check is + // numeric rather than a lexical string comparison. + const exprChain = this.rawColumnPathChain(node.expr); + const kind = this.rawColumnComparisonKind(node.low) ?? this.rawColumnComparisonKind(node.high); + const expr = + exprChain && kind ? this.getRawColumnAccessForField(exprChain, kind)! : this.visit(node.expr); const low = this.visit(node.low); const high = this.visit(node.high); const notKw = node.negated ? " NOT" : "";