From fad11bf80084981b3c54d64ccd36bb628322b421 Mon Sep 17 00:00:00 2001
From: jli
Date: Thu, 7 May 2026 09:55:02 -0700
Subject: [PATCH] Add max_tokens, temperature, and num_workers to judge model
 config

Expose three optional fields on EvaluationJudgeModelConfig:

- max_tokens: lets users override the default (32768) for judge models.
  Critical for reasoning models (e.g. Gemini, o-series) that consume
  output token budget on chain-of-thought before emitting visible
  content, causing truncated JSON and parse failures at the default
  limit.
- temperature: lets users override the judge sampling temperature
  (default 0.05).
- num_workers: concurrent workers for judge inference requests, useful
  for proxy endpoints like OpenRouter.

Also add num_workers and max_tokens to EvaluationModelRequest for the
models being evaluated.

Co-Authored-By: Claude Sonnet 4.6
---
 openapi.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/openapi.yaml b/openapi.yaml
index 05b7d40..1f27f25 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -12338,6 +12338,18 @@ components:
           minimum: 1
           description: "Number of concurrent workers for inference requests. Overrides the default concurrency for this model. Useful for tuning throughput when using proxy endpoints (e.g. OpenRouter) or rate-limited external APIs."
           example: 5
+        max_tokens:
+          type: integer
+          minimum: 1
+          description: "Maximum number of tokens the judge model can generate. Defaults to 32768. Increase for reasoning models (e.g. Gemini, o-series) that consume output token budget for chain-of-thought."
+          example: 8192
+        temperature:
+          type: number
+          format: float
+          minimum: 0
+          maximum: 2
+          description: "Sampling temperature for the judge model. Defaults to 0.05."
+          example: 0.0
 
     EvaluationModelOrString:
       oneOf:
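
For reviewers, a sketch of how a request payload might use the new fields. Only max_tokens, temperature, and num_workers come from this patch; the surrounding key names (judge_model, model) and values are illustrative assumptions, not taken from the diff:

```yaml
# Hypothetical evaluation request fragment (field placement assumed).
judge_model:
  model: gemini-2.5-pro   # assumed judge model identifier
  max_tokens: 65536       # raise above the 32768 default for reasoning models
  temperature: 0.0        # override the 0.05 default for deterministic judging
  num_workers: 5          # concurrent judge inference requests (e.g. via OpenRouter)
```

All three fields are optional; omitting them keeps the documented defaults (32768 tokens, temperature 0.05, default concurrency).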