The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
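
For example, a custom criterion can be supplied when instantiating TLM. This is a minimal sketch, assuming the `cleanlab_tlm` package (whose `TLM` constructor accepts these settings via an `options` dict); the "Conciseness" criterion itself is a hypothetical example:

```python
from cleanlab_tlm import TLM  # assumed import path

# Hypothetical custom criterion: "name" labels it, "criteria" instructs the evaluator.
custom_eval_criteria = [
    {
        "name": "Conciseness",
        "criteria": "Determine whether the response is brief and avoids unnecessary filler.",
    }
]

tlm = TLM(options={"custom_eval_criteria": custom_eval_criteria})
```
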
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()`, as well as during internal trustworthiness scoring.
If you experience token/rate-limit errors, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, it must be between 64 and 512.
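
For instance (a sketch, under the same `options`-dict assumption as above):

```python
# Sketch: cap generation at 256 tokens, a value within the valid range for both
# OpenAI (64-4096) and Claude (64-512) models, to mitigate token/rate-limit errors.
tlm = TLM(options={"max_tokens": 256})
```
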
reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the internal LLM calls are allowed to reason (i.e., the number of thinking tokens) when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
Reduce this value to reduce runtimes; higher values may improve trust scoring.
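
For latency-sensitive applications, reasoning can be dialed down (sketch, same assumptions as above):

```python
# Sketch: lower reasoning effort for faster, though possibly less accurate, trust scoring.
tlm_fast = TLM(options={"reasoning_effort": "low"})
```
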
num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
The maximum number currently supported is 3. Lower values can reduce runtimes.
Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
This parameter has no effect when `disable_trustworthiness` is True.
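
A sketch of trimming self-reflection for speed (same `options`-dict assumption):

```python
# Sketch: a single self-reflection round; faster, but trust scores may be less reliable.
tlm = TLM(options={"num_self_reflections": 1})
```
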
num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
Must be between 0 and 20. Lower values can reduce runtimes.
Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts, or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
This parameter has no effect when `disable_trustworthiness` is True.
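
For example (sketch, same assumptions), consistency sampling can be reduced, or turned off entirely by setting it to 0:

```python
# Sketch: fewer consistency samples for lower runtime; at 0, consistency
# checking is skipped and trust scores rely on self-reflection alone.
tlm = TLM(options={"num_consistency_samples": 4})
```
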
similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
Supported similarity measures include: "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
This parameter has no effect when `num_consistency_samples = 0`.
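
For instance, a code-generation workload might use the "code" measure, while a throughput-sensitive one might prefer "string" (sketch, same assumptions):

```python
# Sketch: compare sampled responses as code rather than as prose.
tlm_code = TLM(options={"similarity_measure": "code"})

# Or minimize runtime with simple character/word-overlap comparison:
tlm_cheap = TLM(options={"similarity_measure": "string"})
```
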
num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
`TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
This parameter has no effect when `disable_trustworthiness` is True.
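
A best-of-N sketch (assuming, as in the TLM docs, that `prompt()` returns a dict containing `response` and `trustworthiness_score`):

```python
# Sketch: internally generate 5 candidates and return the most trustworthy one.
tlm = TLM(options={"num_candidate_responses": 5})
result = tlm.prompt("What is the boiling point of water at sea level, in Celsius?")
print(result["response"], result["trustworthiness_score"])
```
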
disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores; useful if you only want to compute custom evaluation criteria.
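
For example, combining this with the custom criteria defined earlier (sketch, same assumptions; no trust scores are computed, only the custom evaluations):

```python
# Sketch: skip trust scoring; only the custom evaluation criteria run.
tlm_eval_only = TLM(
    options={
        "disable_trustworthiness": True,
        "custom_eval_criteria": custom_eval_criteria,  # defined in the earlier sketch
    }
)
```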