
Commit 250433e

chore: add missing docstrings
1 parent: d64e474

18 files changed: +540 -0 lines changed

src/codex/types/organization_list_members_response.py
Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@


 class OrganizationListMembersResponseItem(BaseModel):
+    """Schema for public organization member information."""
+
     email: str

     name: str
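
For context, the two added lines document a minimal two-field response model. A sketch of how such a model behaves, using a plain `pydantic.BaseModel` as a stand-in for the SDK's generated base class (the member values are hypothetical):

from pydantic import BaseModel


class OrganizationListMembersResponseItem(BaseModel):
    """Schema for public organization member information."""

    email: str
    name: str


# Hypothetical payload, e.g. one item from a list-members API response.
item = OrganizationListMembersResponseItem(email="ada@example.com", name="Ada Lovelace")
print(item.name)          # Ada Lovelace
print(item.model_dump())  # {'email': 'ada@example.com', 'name': 'Ada Lovelace'}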

src/codex/types/project_create_params.py
Lines changed: 53 additions & 0 deletions

@@ -39,6 +39,8 @@ class ProjectCreateParams(TypedDict, total=False):


 class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -56,6 +58,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
+    """A custom evaluation metric created by users.
+
+    The TLMEvalSchema are mutable and stored in the database.
+    """
+
     criteria: Required[str]
     """
     The evaluation criteria text that describes what aspect is being evaluated and
@@ -120,10 +127,14 @@ class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):


 class ConfigEvalConfigCustomEvals(TypedDict, total=False):
+    """Configuration for custom evaluation metrics."""
+
     evals: Dict[str, ConfigEvalConfigCustomEvalsEvals]


 class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -141,6 +152,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -179,6 +196,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -196,6 +215,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -234,6 +259,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -251,6 +278,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -289,6 +322,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -306,6 +341,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -344,6 +385,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -361,6 +404,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -399,6 +448,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False):


 class ConfigEvalConfigDefaultEvals(TypedDict, total=False):
+    """Configuration for default evaluation metrics."""
+
     context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency
     """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
@@ -436,6 +487,8 @@ class ConfigEvalConfigDefaultEvals(TypedDict, total=False):


 class ConfigEvalConfig(TypedDict, total=False):
+    """Configuration for project-specific evaluation metrics"""
+
     custom_evals: ConfigEvalConfigCustomEvals
     """Configuration for custom evaluation metrics."""

src/codex/types/project_detect_params.py
Lines changed: 127 additions & 0 deletions

@@ -440,6 +440,8 @@ class ResponseChatCompletionTyped(TypedDict, total=False):


 class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -457,6 +459,11 @@ class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):


 class EvalConfigCustomEvalsEvals(TypedDict, total=False):
+    """A custom evaluation metric created by users.
+
+    The TLMEvalSchema are mutable and stored in the database.
+    """
+
     criteria: Required[str]
     """
     The evaluation criteria text that describes what aspect is being evaluated and
@@ -521,10 +528,14 @@ class EvalConfigCustomEvalsEvals(TypedDict, total=False):


 class EvalConfigCustomEvals(TypedDict, total=False):
+    """Configuration for custom evaluation metrics."""
+
     evals: Dict[str, EvalConfigCustomEvalsEvals]


 class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -542,6 +553,12 @@ class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):


 class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -580,6 +597,8 @@ class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):


 class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -597,6 +616,12 @@ class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False):


 class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -635,6 +660,8 @@ class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False):


 class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -652,6 +679,12 @@ class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False):


 class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -690,6 +723,8 @@ class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False):


 class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -707,6 +742,12 @@ class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False):


 class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -745,6 +786,8 @@ class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False):


 class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -762,6 +805,12 @@ class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False):


 class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     eval_key: Required[str]
     """
     Unique key for eval metric - currently maps to the TrustworthyRAG name property
@@ -800,6 +849,8 @@ class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False):


 class EvalConfigDefaultEvals(TypedDict, total=False):
+    """Configuration for default evaluation metrics."""
+
     context_sufficiency: EvalConfigDefaultEvalsContextSufficiency
     """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
@@ -837,6 +888,8 @@ class EvalConfigDefaultEvals(TypedDict, total=False):


 class EvalConfig(TypedDict, total=False):
+    """All of the evals that should be used for this query"""
+
     custom_evals: EvalConfigCustomEvals
     """Configuration for custom evaluation metrics."""

@@ -1041,6 +1094,80 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False):


 class Options(TypedDict, total=False):
+    """
+    Typed dict of advanced configuration options for the Trustworthy Language Model.
+    Many of these configurations are determined by the quality preset selected
+    (learn about quality presets in the TLM [initialization method](./#class-tlm)).
+    Specifying TLMOptions values directly overrides any default values set from the quality preset.
+
+    For all options described below, higher settings will lead to longer runtimes and may consume more tokens internally.
+    You may not be able to run long prompts (or prompts with long responses) in your account,
+    unless your token/rate limits are increased. If you hit token limit issues, try lower/less expensive TLMOptions
+    to be able to run longer prompts/responses, or contact Cleanlab to increase your limits.
+
+    The default values corresponding to each quality preset are:
+    - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, `reasoning_effort` = `"high"`.
+    - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, `reasoning_effort` = `"high"`.
+    - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"high"`.
+    - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"none"`.
+    - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, `reasoning_effort` = `"none"`.
+
+    By default, TLM uses the "medium" `quality_preset`, the "gpt-4.1-mini" base `model`, and `max_tokens` set to 512.
+    You can set custom values for these arguments regardless of the quality preset specified.
+
+    Args:
+        model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield better results, faster models yield faster results).
+        - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku".
+        - Recommended models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0".
+        - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".
+
+        log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+        For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+        custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+        The expected input format is a list of dictionaries, where each dictionary has the following keys:
+        - name: Name of the evaluation criteria.
+        - criteria: Instructions specifying the evaluation criteria.
+
+        max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring.
+        If you experience token/rate-limit errors, try lowering this number.
+        For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+        reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+        when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+        Reduce this value to reduce runtimes. Higher values may improve trust scoring.
+
+        num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
+        The maximum number currently supported is 3. Lower values can reduce runtimes.
+        Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
+        Must be between 0 and 20. Lower values can reduce runtimes.
+        Measuring consistency helps quantify the epistemic uncertainty associated with
+        strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+        TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
+        trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+        Supported similarity measures include "semantic" (based on natural language inference),
+        "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+        "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+        and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
+        This parameter has no effect when `num_consistency_samples = 0`.
+
+        num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+        `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+        You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
+        This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+        When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores,
+        useful if you only want to compute custom evaluation criteria.
+    """
+
     custom_eval_criteria: Iterable[object]

     disable_persistence: bool
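
The Options docstring above is descriptive rather than enforced, so a concrete payload helps. A minimal sketch using only keys documented in that docstring, with values mirroring the "high" quality preset (the custom criterion is hypothetical):

# Hypothetical Options payload; every key below is documented in the
# docstring above, and the numeric values mirror the "high" quality preset.
options = {
    "model": "gpt-4.1",                   # listed as recommended for accuracy
    "reasoning_effort": "high",
    "num_self_reflections": 3,            # maximum currently supported
    "num_consistency_samples": 4,         # "high" preset default
    "similarity_measure": "discrepancy",  # the default measure
    "max_tokens": 512,
    "log": ["explanation"],               # return explanations for low trust scores
    "custom_eval_criteria": [
        {
            "name": "conciseness",  # hypothetical custom criterion
            "criteria": "Rate whether the response is concise and avoids filler.",
        }
    ],
}

Since `Options` is a `TypedDict` with `total=False`, any subset of these keys is a valid value.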
