Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7c8ffe6
Add initial configs for Feb evaluation
dmjoy Dec 18, 2025
7652264
Update TA3 client version; update ICL databases
dmjoy Dec 19, 2025
1dab7b4
Update choose_a and choose_b experiments to point at AWS TA3 server
dmjoy Dec 19, 2025
46815ee
Add spectrum-tuned configs
eveenhuis Dec 24, 2025
0ce2d74
Add observation configs
eveenhuis Dec 31, 2025
a7ca086
Quickfix for direct regression PS scenarios
dmjoy Jan 1, 2026
76c187d
Upgrade TA3 client version dep
dmjoy Jan 1, 2026
21a434d
Multi-attribute run tweaks and experiment configs
dmjoy Jan 7, 2026
ea8060a
Set up caching for ICL step
dmjoy Jan 7, 2026
23073d0
Add live eval configs for multi-target
dmjoy Jan 9, 2026
914a6bc
Add direct reg multi configs with different backbones
dmjoy Jan 13, 2026
5406d38
MF z-score values updated (2026-01-21)
dmjoy Jan 26, 2026
c47e6a7
Add and enable caching for tagging ADMs
dmjoy Feb 11, 2026
b353060
Attempt to get the LLM to provide START tag rather than what it think…
aaron-bray Feb 13, 2026
bbb3027
Removing protocol names from tagging prompts to try get better object…
aaron-bray Feb 16, 2026
9bd2be1
Add icl for each tagging protocol
aaron-bray Feb 16, 2026
7e7753c
Another round of tagging prompt and icl changes for evaluation
aaron-bray Feb 19, 2026
1d6dd93
Changing protocol tags from colors to shapes
aaron-bray Feb 20, 2026
69c5b83
Missed changing some text from color to shape in our tagging prompt
aaron-bray Feb 20, 2026
f2b82af
Add verbiage to tagging prompts for LLM to stop advancing through the…
aaron-bray Feb 21, 2026
d122dc0
More verbiage updates to tagging prompts
aaron-bray Feb 23, 2026
2eefade
More detail added to extremity injuries
aaron-bray Feb 23, 2026
91457a7
An attempt at better control of start tagging
aaron-bray Feb 28, 2026
77e1ef1
Add expected resulting tag to auto generated icl reasons
aaron-bray Mar 3, 2026
02f67b9
Support variable tag symbols
aaron-bray Mar 18, 2026
a6e5369
Default to hand crafted icl
aaron-bray Mar 18, 2026
2463342
Conform to outlines
aaron-bray Mar 18, 2026
1613151
Need to also pass in symbol type for different prompt symbols
aaron-bray Mar 19, 2026
8c1df05
New tagging ICL for shapes
aaron-bray Mar 23, 2026
393f6f4
Remove clinical definition from color, and specific protocol names fr…
aaron-bray Mar 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ __pycache__/
outputs
slurm*.out

.idea/
.vscode/

5 changes: 3 additions & 2 deletions align_system/algorithms/alignment_adm_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,14 +327,15 @@ def run_returns(self):

def _compute_p_choose_a(self, kdma, intercept, medical_weight, attr_weight, raw_medical_delta, raw_attr_score):
# Provided by ADEPT 2025-12-12
# MF updated 2026-01-21
scaling = {
"affiliation": {
"medical": [0.403801, 0.297245],
"attribute": [0.405073, 0.298288],
},
"merit": {
"medical": [0.433409, 0.308294],
"attribute": [0.357632, 0.27947],
"medical": [0.428961, 0.301250],
"attribute": [0.337618, 0.272520],
},
"personal_safety": {
"medical": [0.456221, 0.246484],
Expand Down
71 changes: 68 additions & 3 deletions align_system/algorithms/icl_adm_component.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import re
import inspect
import copy
from functools import lru_cache
from collections.abc import Mapping

import ubelt as ub

from align_system.utils import logging, call_with_coerced_args
from align_system.utils.alignment_utils import attributes_in_alignment_target
from align_system.algorithms.abstracts import ADMComponent
Expand Down Expand Up @@ -37,7 +42,8 @@ def __init__(self,
scenario_description_template,
prompt_template,
attributes=None,
target_attribute_names_override=None):
target_attribute_names_override=None,
enable_caching=False):
self.icl_generator_partial = icl_generator_partial
self.scenario_description_template = scenario_description_template

Expand All @@ -49,6 +55,8 @@ def __init__(self,

self.target_attribute_names_override = target_attribute_names_override

self.enable_caching = enable_caching

def run_returns(self):
return ('icl_dialog_elements', 'icl_example_info')

Expand Down Expand Up @@ -77,6 +85,29 @@ def run(self,

target_attributes = [self.attributes[n] for n in target_attribute_names]

if self.enable_caching:
scenario_state_copy = copy.deepcopy(scenario_state)
if hasattr(scenario_state, 'elapsed_time'):
# Don't consider the elapsed_time of the state when caching
scenario_state_copy.elapsed_time = 0

depends = '\n'.join((
self.cache_repr(),
repr(scenario_state_copy),
repr(choices),
repr(target_attribute_names)))

cacher = ub.Cacher('icl_adm_component', depends, verbose=0)
log.debug(f'cacher.fpath={cacher.fpath}')

cached_output = cacher.tryload()
if cached_output is not None:
log.info("Cache hit for `icl_adm_component`"
" returning cached output")
return cached_output
else:
log.info("Cache miss for `icl_adm_component` ..")

# Mapping covers `dict` and `omegaconf.dictconfig.DictConfig`
if not isinstance(alignment_target, Mapping):
alignment_target_dict = alignment_target.to_dict()
Expand All @@ -85,7 +116,8 @@ def run(self,

alignment_target_value_lookup = {
kdma_values['kdma']: kdma_values['value']
for kdma_values in alignment_target_dict['kdma_values']}
for kdma_values in alignment_target_dict['kdma_values']
if 'value' in kdma_values}

icl_dialog_elements = {}
icl_example_info = {}
Expand Down Expand Up @@ -148,7 +180,40 @@ def run(self,
}
icl_example_info[attribute.kdma].append(icl_info)

return icl_dialog_elements, icl_example_info
outputs = (icl_dialog_elements, icl_example_info)

if self.enable_caching:
cacher.save(outputs)

return outputs

def cache_repr(self):
    '''
    Return a string representation of this object for caching;
    i.e. if the return value of this function is the same for two
    object instances, it's assumed that `run` output will be
    the same if given the same parameters.

    NOTE(review): `enable_caching` itself is not part of the key —
    presumably because it doesn't affect `run` results, only whether
    they're cached; confirm if new behavior-affecting params are added.
    '''

    def _generic_object_repr(obj):
        # Render an object as "module.Class(param=value, ...)" using its
        # __init__ parameter names looked up against the instance's vars.
        # Guard against optional components left as None (e.g. defaults);
        # mirrors the equivalent helper in
        # prompt_based_aligned_adm_component's cache_repr.
        if obj is None:
            return "None"

        init_params = inspect.signature(obj.__class__.__init__).parameters
        obj_vars = vars(obj)

        return "{}.{}({})".format(
            obj.__class__.__module__,
            obj.__class__.__name__,
            ", ".join([f"{p}={obj_vars[p]}" for p in init_params
                       if p != 'self' and p != 'args' and p != 'kwargs']))

    # Strip leading whitespace from every line of the template so the key
    # is stable regardless of source indentation
    return re.sub(r'^\s+', '',
                  f"""
{self.__class__.__module__}.{self.__class__.__name__}(
icl_generator_partial={self.icl_generator_partial},
scenario_description_template={_generic_object_repr(self.scenario_description_template)},
prompt_template={_generic_object_repr(self.prompt_template)},
attributes={self.attributes},
target_attribute_names_override={self.target_attribute_names_override},
)""", flags=re.MULTILINE).strip()


# ICL Engines dependent on alignment target, but that could change
Expand Down
78 changes: 76 additions & 2 deletions align_system/algorithms/prompt_based_aligned_adm_component.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import re
import inspect
import copy

from rich.highlighter import JSONHighlighter
from swagger_client.models import KDMAValue
import ubelt as ub

from align_system.utils import logging, call_with_coerced_args
from align_system.algorithms.abstracts import ADMComponent
Expand All @@ -24,7 +29,8 @@ def __init__(self,
num_negative_samples=0,
vote_calculator_fn=calculate_votes,
filter_votes_to_positives=True,
shuffle_choices=True):
shuffle_choices=True,
enable_caching=False):
self.structured_inference_engine = structured_inference_engine
self.scenario_description_template = scenario_description_template
self.prompt_template = prompt_template
Expand All @@ -40,6 +46,8 @@ def __init__(self,

self.shuffle_choices = shuffle_choices

self.enable_caching = enable_caching

def run_returns(self):
return ('chosen_choice', 'justification', 'dialog')

Expand All @@ -61,6 +69,31 @@ def run(self,
# Assumption here is that KDMA values range from 0-1
negative_value = 1 - value

if self.enable_caching:
scenario_state_copy = copy.deepcopy(scenario_state)
if hasattr(scenario_state, 'elapsed_time'):
# Don't consider the elapsed_time of the state when caching
scenario_state_copy.elapsed_time = 0

depends = '\n'.join((
self.cache_repr(),
repr(scenario_state_copy),
repr(choices),
repr(positive_icl_dialog_elements),
repr(negative_icl_dialog_elements),
repr(kdma_value)))

cacher = ub.Cacher('prompt_based_aligned_adm_component', depends, verbose=0)
log.debug(f'cacher.fpath={cacher.fpath}')

cached_output = cacher.tryload()
if cached_output is not None:
log.info("Cache hit for `prompt_based_aligned_adm_component`"
" returning cached output")
return cached_output
else:
log.info("Cache miss for `prompt_based_aligned_adm_component` ..")

scenario_description = call_with_coerced_args(
self.scenario_description_template,
{'scenario_state': scenario_state})
Expand Down Expand Up @@ -182,4 +215,45 @@ def run(self,
top_choice_justification = response['detailed_reasoning']
break

return top_choice, top_choice_justification, positive_dialog
outputs = (top_choice, top_choice_justification, positive_dialog)

if self.enable_caching:
cacher.save(outputs)

return outputs

def cache_repr(self):
    '''
    Build a deterministic string describing this component's
    configuration, used as (part of) a cache key; i.e. two instances
    producing the same string are assumed to yield identical `run`
    output when given the same parameters.
    '''

    def _generic_object_repr(component):
        # A missing/optional component has no constructor state to render
        if component is None:
            return "None"

        cls = component.__class__
        params = inspect.signature(cls.__init__).parameters
        state = vars(component)

        rendered = ", ".join(
            f"{name}={state[name]}"
            for name in params
            if name not in ('self', 'args', 'kwargs'))

        return "{}.{}({})".format(cls.__module__, cls.__name__, rendered)

    raw = f"""
    {self.__class__.__module__}.{self.__class__.__name__}(
    structured_inference_engine={self.structured_inference_engine.cache_repr()},
    scenario_description_template={_generic_object_repr(self.scenario_description_template)},
    prompt_template={_generic_object_repr(self.prompt_template)},
    output_schema_template={_generic_object_repr(self.output_schema_template)},
    system_prompt_template={_generic_object_repr(self.system_prompt_template)},
    num_positive_samples={self.num_positive_samples},
    num_negative_samples={self.num_negative_samples},
    vote_calculator_fn={_generic_object_repr(self.vote_calculator_fn)},
    filter_votes_to_positives={self.filter_votes_to_positives},
    shuffle_choices={self.shuffle_choices},
    )"""

    # Per-line leading-whitespace strip keeps the key independent of
    # source indentation
    return re.sub(r'^\s+', '', raw, flags=re.MULTILINE).strip()
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: phase2_pipeline_zeroshot_comparative_regression_swap_average
name: phase2_pipeline_direct_medical_regression

defaults:
# Import defaults into this namespace (adm) as @name, for further
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: phase2_pipeline_zeroshot_comparative_regression_swap_average
name: phase2_pipeline_direct_regression

defaults:
# Import defaults into this namspace (adm) as @name, for further
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# @package _global_
# Experiment config: baseline pipeline ADM against the TA3 interface
# (ADEPT full training session) using the greedy outlines structured
# inference engine, with caching enabled for the outlines_baseline step.
defaults:
- override /adm: pipeline_baseline
- override /inference_engine@adm.structured_inference_engine: outlines_structured_greedy
- override /interface: ta3

# TA3 server connection and session settings
interface:
api_endpoint: "https://darpaitm.caci.com"
session_type: adept
training_session: full
username: "testrun-pipeline_baseline"
domain: "p2triage"

adm:
# Phase 2 prompt templates for the baseline step
step_definitions:
outlines_baseline:
scenario_description_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2ScenarioDescription
prompt_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2BaselinePrompt

# Enable on-disk caching of this step's outputs
enable_caching: true

instance:
steps:
# Reference the step instances we want to use in order
- ${ref:adm.step_definitions.format_choices}
- ${ref:adm.step_definitions.outlines_baseline}
# - ${ref:adm.step_definitions.action_parameter_completion}
- ${ref:adm.step_definitions.ensure_chosen_action}
- ${ref:adm.step_definitions.populate_choice_info}

driver:
apply_action_filtering: false

force_determinism: true
align_to_target: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# @package _global_
# Experiment config: baseline pipeline ADM live evaluation (eval session,
# no training) on the TA3 server, with the structured inference engine's
# model overridden to DeepSeek-R1-Distill-Llama-8B.
defaults:
- override /adm: pipeline_baseline
- override /interface: ta3

# TA3 server connection and session settings (live eval username)
interface:
api_endpoint: "https://darpaitm.caci.com"
session_type: eval
training_session: null
username: "ALIGN-ADM-OutlinesBaseline-DeepSeek-R1-Distill-Llama-8B"
domain: "p2triage"

adm:
# Swap the backbone model for this run
structured_inference_engine:
model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

# Phase 2 prompt templates for the baseline step
step_definitions:
outlines_baseline:
scenario_description_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2ScenarioDescription
prompt_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2BaselinePrompt

# Enable on-disk caching of this step's outputs
enable_caching: true

instance:
steps:
# Reference the step instances we want to use in order
- ${ref:adm.step_definitions.format_choices}
- ${ref:adm.step_definitions.outlines_baseline}
# - ${ref:adm.step_definitions.action_parameter_completion}
- ${ref:adm.step_definitions.ensure_chosen_action}
- ${ref:adm.step_definitions.populate_choice_info}

driver:
apply_action_filtering: false

force_determinism: true
align_to_target: false
save_last_unstructured_state_per_scenario: true

# Timestamped output directory for live multi-target eval results
hydra:
run:
dir: 'phase2_feb2026_multi_results_live/phase2_baseline_deepseek_llama_live_eval_multi/${now:%Y-%m-%d__%H-%M-%S}'
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# @package _global_
# Experiment config: test-run variant of the DeepSeek baseline live eval
# ("testrun-" username, results written to a local directory); otherwise
# mirrors the live-eval DeepSeek-R1-Distill-Llama-8B baseline config.
defaults:
- override /adm: pipeline_baseline
- override /interface: ta3

# TA3 server connection and session settings (test-run username)
interface:
api_endpoint: "https://darpaitm.caci.com"
session_type: eval
training_session: null
username: "testrun-pipeline_baseline_deepseek_llama"
domain: "p2triage"

adm:
# Swap the backbone model for this run
structured_inference_engine:
model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

# Phase 2 prompt templates for the baseline step
step_definitions:
outlines_baseline:
scenario_description_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2ScenarioDescription
prompt_template:
_target_: align_system.prompt_engineering.outlines_prompts.Phase2BaselinePrompt

# Enable on-disk caching of this step's outputs
enable_caching: true

instance:
steps:
# Reference the step instances we want to use in order
- ${ref:adm.step_definitions.format_choices}
- ${ref:adm.step_definitions.outlines_baseline}
# - ${ref:adm.step_definitions.action_parameter_completion}
- ${ref:adm.step_definitions.ensure_chosen_action}
- ${ref:adm.step_definitions.populate_choice_info}

driver:
apply_action_filtering: false

force_determinism: true
align_to_target: false
save_last_unstructured_state_per_scenario: true

# Timestamped local output directory for test results
hydra:
run:
dir: 'phase2_feb2026_results_local/phase2_baseline_deepseek_llama_live_eval_test/${now:%Y-%m-%d__%H-%M-%S}'
Loading