Skip to content

Commit e9de721

Browse files
committed
fix: color could not be changed by geometric algebra because grade-0 is invariant; the resulting energy imbalance made the gradient reaching the color components weak and indirect
1 parent 12030ae commit e9de721

14 files changed

Lines changed: 789 additions & 616 deletions

File tree

conf/task/gtm.yaml

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,6 @@
11
# @package _global_
22
name: gtm
33

4-
# ── RTX Pro 4500 (32 GB VRAM, Ada Lovelace) tuning notes ──────────────
5-
#
6-
# VRAM budget breakdown (fp16 activations via AMP):
7-
# Phase 1 (demo): B=24, K=3, grid=30×30 → N_demo = 5400 cells
8-
# Attention [B,H,N,N]: 24×4×5400×5400×2B ≈ 5.6 GB
9-
# CPU state × 12 steps: 24×5400×16×2B × 12 ≈ 50 MB
10-
# Phase 2 (test): N_test = 900 cells → attention < 150 MB
11-
# Model params + optimizer: < 1 GB
12-
# Total: ~8–10 GB in fp16, safely within 32 GB
13-
#
14-
# Key CUDA flags:
15-
# amp: true — bf16 forward/backward (Ada Lovelace tensor cores)
16-
# compile: true — torch.compile for kernel fusion
17-
# cudnn_benchmark: true
18-
# pin_memory: true — async CPU→GPU transfer
19-
# num_workers: 4 — parallel data loading
20-
214
algebra:
225
p: 3
236
q: 0
@@ -49,20 +32,20 @@ dataset:
4932
toy_n_examples: 20000
5033
toy_max_grid_size: 15
5134
num_demos: 3
52-
epoch_samples: 0 # 0 = full dataset shuffle; set >0 for capped-epoch sampling
35+
epoch_samples: 4000 # 0 = full dataset shuffle; set >0 for capped-epoch sampling
5336

5437
training:
5538
epochs: 150
5639
lr: 0.0005
57-
batch_size: 24
40+
batch_size: 16
5841
optimizer_type: riemannian_adam
5942
max_bivector_norm: 10.0
6043

6144
# CUDA acceleration
6245
num_workers: 4
6346
pin_memory: true
6447
amp: true
65-
compile: true
48+
compile: false
6649
cudnn_benchmark: true
6750

6851
# Three-phase schedule (scaled for 150 epochs)
@@ -71,6 +54,10 @@ training:
7154
act_epochs: 70
7255
act_weight: 0.01
7356
act_ramp_epochs: 20
74-
gate_entropy_weight: 0.001
57+
gate_entropy_weight: 0.01
7558
grad_clip: 1.0
7659
eval_every: 5
60+
tau_start: 1.0
61+
tau_act_restart: 0.7
62+
tau_end: 0.1
63+

datalib/benchmarks.py

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
# Versor: Universal Geometric Algebra Neural Network
2+
# Copyright (C) 2026 Eunkyum Kim <nemonanconcode@gmail.com>
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
#
7+
8+
"""BIG-Bench Hard (BBH) data loading with curriculum learning support."""
9+
10+
import re
11+
import torch
12+
from torch.utils.data import Dataset, DataLoader, Sampler
13+
14+
15+
# ---------------------------------------------------------------------------
16+
# Task difficulty tiers for curriculum learning
17+
# ---------------------------------------------------------------------------
18+
19+
# Curriculum tiers keyed by difficulty (1 = easiest). Each value lists the
# BBH task names assigned to that tier.
TASK_TIERS = {
    # Tier 1 — binary (2 choices): basic pattern matching.
    1: [
        'boolean_expressions',
        'navigate',
        'sports_understanding',
        'web_of_lies',
        'causal_judgement',
        'formal_fallacies',
    ],
    # Tier 2 — simple multiple choice (2-4 choices): moderate reasoning.
    2: [
        'disambiguation_qa',
        'hyperbaton',
        'snarks',
        'ruin_names',
        'logical_deduction_three_objects',
        'tracking_shuffled_objects_three_objects',
        'temporal_sequences',
    ],
    # Tier 3 — complex multiple choice (5+ choices): multi-step reasoning.
    3: [
        'date_understanding',
        'movie_recommendation',
        'penguins_in_a_table',
        'salient_translation_error_detection',
        'logical_deduction_five_objects',
        'tracking_shuffled_objects_five_objects',
        'reasoning_about_colored_objects',
        'geometric_shapes',
    ],
}

# Every curriculum task, flattened in ascending tier order.
ALL_CURRICULUM_TASKS = [
    task_name
    for tier_id in sorted(TASK_TIERS)
    for task_name in TASK_TIERS[tier_id]
]
50+
51+
52+
def get_tier_for_task(task_name: str) -> int:
    """Return the curriculum tier whose task list contains ``task_name``.

    Tasks that are not listed in any tier of ``TASK_TIERS`` are assigned
    tier 3.
    """
    matching_tiers = (
        tier_id
        for tier_id, members in TASK_TIERS.items()
        if task_name in members
    )
    # The first hit wins; 3 is the fallback for unlisted tasks.
    return next(matching_tiers, 3)
57+
58+
59+
# ---------------------------------------------------------------------------
60+
# Answer parsing
61+
# ---------------------------------------------------------------------------
62+
63+
# Binary target strings mapped to class indices: the affirmative word of each
# pair maps to 1 and the negative word to 0, in both capitalised and
# lower-case spellings.
BINARY_ANSWERS = {
    spelling: class_index
    for affirmative, negative in (
        ('True', 'False'),
        ('Yes', 'No'),
        ('Valid', 'Invalid'),
    )
    for spelling, class_index in (
        (affirmative, 1),
        (negative, 0),
        (affirmative.lower(), 1),
        (negative.lower(), 0),
    )
}

# Multiple-choice targets are a single parenthesised capital letter, e.g. "(B)".
MC_PATTERN = re.compile(r'^\(([A-Z])\)$')
73+
74+
75+
def _parse_answer(target: str, task_name: str) -> tuple:
    """Convert a BBH target string into ``(label_index, num_choices)``.

    Surrounding whitespace is stripped before matching. ``task_name`` is
    accepted for interface compatibility but is not consulted.

    Returns:
        ``(label, 2)`` when the target is one of the ``BINARY_ANSWERS`` keys;
        ``(index, None)`` when the target is a parenthesised capital letter
        ("(A)" -> 0, "(B)" -> 1, ...), leaving the number of choices for the
        caller to determine by scanning all examples;
        ``(None, None)`` for anything else (free-text answers).
    """
    cleaned = target.strip()

    # Binary answers: 0 is a valid label, so compare against None explicitly.
    binary_label = BINARY_ANSWERS.get(cleaned)
    if binary_label is not None:
        return binary_label, 2

    letter_match = MC_PATTERN.match(cleaned)
    if letter_match is None:
        # Unparseable (free-text answer) — the caller skips these.
        return None, None

    return ord(letter_match.group(1)) - ord('A'), None
94+
95+
96+
# ---------------------------------------------------------------------------
97+
# Single-task dataset
98+
# ---------------------------------------------------------------------------
99+
100+
class BBHDataset(Dataset):
    """BIG-Bench Hard dataset for a single task.

    Loads ``task_name`` from the lukaemon/bbh HuggingFace dataset, keeps the
    examples whose target parses to a class index, and tokenizes all of
    their inputs up front. Examples with unparseable answers are silently
    skipped.

    Args:
        task_name: BBH configuration name passed to ``load_dataset``.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``;
            its result must provide ``'input_ids'`` and ``'attention_mask'``
            (presumably a HuggingFace tokenizer — confirm against callers).
        max_len: Padding/truncation length handed to the tokenizer.
        split: Split to read. If the loaded dataset has no such split, the
            first available split is used instead.
        num_choices: Explicit number of answer classes. When omitted (or 0)
            it is inferred: 2 for binary targets, otherwise the highest
            multiple-choice label seen plus one.

    Attributes set by ``__init__``: ``num_choices``, ``labels`` (list of
    int), ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, task_name: str, tokenizer, max_len: int = 512,
                 split: str = 'test', num_choices: int = None):
        from datasets import load_dataset
        # NOTE(review): trust_remote_code=True lets the dataset repository's
        # loading code run locally — keep only while the source is trusted.
        ds = load_dataset("lukaemon/bbh", task_name, trust_remote_code=True)

        if split in ds:
            raw = ds[split]
        else:
            # Requested split is missing: fall back to the first one.
            raw = ds[list(ds.keys())[0]]

        # Parse answers, skip unparseable
        parsed = []
        texts = []
        max_choice = 0
        for example in raw:
            label, nc = _parse_answer(example['target'], task_name)
            if label is None:
                continue
            # Binary answers report their own choice count; for letter
            # answers the count is at least the highest label seen + 1.
            max_choice = max(max_choice, nc if nc is not None else label + 1)
            parsed.append(label)
            texts.append(example['input'])

        self.num_choices = num_choices or max_choice
        self.labels = parsed

        if texts:
            encodings = tokenizer(
                texts,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            self.input_ids = encodings['input_ids']
            self.attention_mask = encodings['attention_mask']
        else:
            # No parseable example: do not rely on how the tokenizer treats
            # an empty batch; expose well-formed empty tensors so callers
            # can simply check ``len(dataset) == 0``.
            self.input_ids = torch.zeros((0, max_len), dtype=torch.long)
            self.attention_mask = torch.zeros((0, max_len), dtype=torch.long)

    def __len__(self):
        """Return the number of retained (parseable) examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized input and its label as a dict of tensors."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
155+
156+
157+
# ---------------------------------------------------------------------------
158+
# Multi-task curriculum dataset
159+
# ---------------------------------------------------------------------------
160+
161+
class BBHCurriculumDataset(Dataset):
    """Multi-task BBH dataset with per-example curriculum metadata.

    Every task in ``task_names`` is loaded through ``BBHDataset``; tasks that
    fail to load or contain no parseable example are skipped with a printed
    warning. Items returned by ``__getitem__`` carry ``task_id`` and
    ``num_valid_choices`` so the training loop can mask invalid logits. The
    per-example difficulty tier is kept in ``self.tiers`` (parallel to the
    examples) for use by a curriculum sampler; it is not part of the item.

    ``task_id`` indexes ``self.task_names`` — the list of tasks that were
    actually loaded — so ids stay dense and aligned with that list even when
    some requested tasks are skipped.

    Args:
        task_names: Iterable of BBH task names to load.
        tokenizer: Tokenizer forwarded to ``BBHDataset``.
        max_len: Padding/truncation length forwarded to ``BBHDataset``.

    Raises:
        RuntimeError: If no task produced any example.
    """

    def __init__(self, task_names, tokenizer, max_len: int = 512):
        input_id_chunks = []
        attention_mask_chunks = []
        all_labels = []
        all_num_valid = []
        all_task_ids = []
        all_tiers = []

        self.task_names = []
        self.task_num_choices = {}
        max_choices = 0

        for task_name in task_names:
            # Best effort: a task that cannot be loaded is reported and
            # skipped instead of aborting the whole curriculum.
            try:
                ds = BBHDataset(task_name, tokenizer, max_len)
            except Exception as e:
                print(f" Warning: skipping task {task_name}: {e}")
                continue

            if len(ds) == 0:
                print(f" Warning: no parseable examples for {task_name}")
                continue

            nc = ds.num_choices
            # Assign the id from the list of loaded tasks, not from the
            # position in the requested list, so that
            # self.task_names[task_id] is always the example's task.
            task_id = len(self.task_names)
            self.task_names.append(task_name)
            self.task_num_choices[task_name] = nc
            max_choices = max(max_choices, nc)
            tier = get_tier_for_task(task_name)

            n_examples = len(ds)
            input_id_chunks.append(ds.input_ids)
            attention_mask_chunks.append(ds.attention_mask)
            all_labels.extend(ds.labels)
            all_num_valid.extend([nc] * n_examples)
            all_task_ids.extend([task_id] * n_examples)
            all_tiers.extend([tier] * n_examples)

        if not input_id_chunks:
            raise RuntimeError(
                "BBHCurriculumDataset: none of the requested tasks produced "
                "any examples"
            )

        # Each chunk is one task's (n_examples, max_len) tensor; a single
        # concatenation replaces stacking the rows one by one.
        self.input_ids = torch.cat(input_id_chunks, dim=0)
        self.attention_mask = torch.cat(attention_mask_chunks, dim=0)
        self.labels = all_labels
        self.num_valid_choices = all_num_valid
        self.task_ids = all_task_ids
        self.tiers = all_tiers
        self.max_choices = max_choices

    def __len__(self):
        """Return the total number of examples across all loaded tasks."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example's tensors plus its task metadata."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'num_valid_choices': torch.tensor(self.num_valid_choices[idx],
                                              dtype=torch.long),
            'task_id': torch.tensor(self.task_ids[idx], dtype=torch.long),
        }
226+
227+
228+
# ---------------------------------------------------------------------------
229+
# Curriculum sampler
230+
# ---------------------------------------------------------------------------
231+
232+
class CurriculumSampler(Sampler):
    """Random-order sampler restricted to examples in the active tiers.

    Args:
        tiers: Tier of each example, indexed by example position.
        active_tier_ids: Tiers whose examples may be drawn.
    """

    def __init__(self, tiers: list, active_tier_ids: set):
        allowed = set(active_tier_ids)
        selected = []
        for position, tier in enumerate(tiers):
            if tier in allowed:
                selected.append(position)
        # Positions (into ``tiers``) of every example that may be sampled.
        self.indices = selected

    def __iter__(self):
        # A fresh permutation is drawn on every pass over the sampler.
        order = torch.randperm(len(self.indices)).tolist()
        shuffled = [self.indices[slot] for slot in order]
        return iter(shuffled)

    def __len__(self):
        return len(self.indices)
245+
246+
247+
# ---------------------------------------------------------------------------
248+
# Loader factories
249+
# ---------------------------------------------------------------------------
250+
251+
def get_bbh_loaders(
    task_name: str,
    tokenizer,
    batch_size: int = 16,
    max_len: int = 512,
    train_ratio: float = 0.8,
    num_workers: int = 0,
    num_choices: int = None,
) -> dict:
    """Build train/val DataLoaders for one BBH task.

    The task is loaded as a ``BBHDataset`` and split at random with a fixed
    seed (42); ``int(len(dataset) * train_ratio)`` examples go to training
    and the remainder to validation.

    Returns:
        Dict with ``'train'`` (shuffled DataLoader), ``'val'`` (unshuffled
        DataLoader) and ``'num_choices'`` (the dataset's class count).
    """
    full_ds = BBHDataset(task_name, tokenizer, max_len, num_choices=num_choices)

    total = len(full_ds)
    train_size = int(total * train_ratio)
    split_sizes = [train_size, total - train_size]

    # Seeded generator so the split is the same on every call.
    seeded = torch.Generator().manual_seed(42)
    train_part, val_part = torch.utils.data.random_split(
        full_ds, split_sizes, generator=seeded,
    )

    train_loader = DataLoader(
        train_part, batch_size=batch_size, shuffle=True,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_part, batch_size=batch_size, shuffle=False,
        num_workers=num_workers,
    )

    loaders = {}
    loaders['train'] = train_loader
    loaders['val'] = val_loader
    loaders['num_choices'] = full_ds.num_choices
    return loaders
279+
280+
281+
def get_curriculum_loaders(
    task_names: list,
    tokenizer,
    max_len: int = 512,
    train_ratio: float = 0.8,
) -> dict:
    """Load all tasks into one curriculum dataset and split it train/val.

    No DataLoader is created here: the result holds the dataset objects and
    the tier metadata, and the experiment script builds DataLoaders on the
    fly with CurriculumSampler. The split is random with a fixed seed (42);
    ``int(len(dataset) * train_ratio)`` examples go to training.

    Returns:
        Dict with keys ``'full_dataset'``, ``'train_dataset'``,
        ``'val_dataset'``, ``'train_tiers'``, ``'val_tiers'``,
        ``'max_choices'``, ``'task_names'`` and ``'task_num_choices'``.
    """
    combined = BBHCurriculumDataset(task_names, tokenizer, max_len)

    total = len(combined)
    train_size = int(total * train_ratio)

    seeded = torch.Generator().manual_seed(42)
    train_part, val_part = torch.utils.data.random_split(
        combined, [train_size, total - train_size], generator=seeded,
    )

    def _tiers_of(subset):
        # A subset stores positions into the full dataset; look up the tier
        # of each so the list lines up with the subset's own ordering.
        return [combined.tiers[position] for position in subset.indices]

    bundle = {}
    bundle['full_dataset'] = combined
    bundle['train_dataset'] = train_part
    bundle['val_dataset'] = val_part
    bundle['train_tiers'] = _tiers_of(train_part)
    bundle['val_tiers'] = _tiers_of(val_part)
    bundle['max_choices'] = combined.max_choices
    bundle['task_names'] = combined.task_names
    bundle['task_num_choices'] = combined.task_num_choices
    return bundle

0 commit comments

Comments
 (0)