From 5ce2371f6a34b08704a6b46429454060c5719bea Mon Sep 17 00:00:00 2001 From: Alexander Rubinstein Date: Sun, 29 Mar 2026 09:41:03 +0200 Subject: [PATCH] - Update installation commands for DISCO example - Simplify log messages for DISCO example - Update example logs in README for DISCO example --- examples/mmlu_benchmark/README.md | 39 +++++++++++++++++------ examples/mmlu_benchmark/mmlu_benchmark.py | 9 ++---- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/examples/mmlu_benchmark/README.md b/examples/mmlu_benchmark/README.md index 0e90291b..bd0fd039 100644 --- a/examples/mmlu_benchmark/README.md +++ b/examples/mmlu_benchmark/README.md @@ -1,19 +1,27 @@ # MMLU Benchmark Example -Evaluate language models on [MMLU (Massive Multitask Language Understanding)](https://arxiv.org/abs/2009.03300) with optional efficient evaluation via [DISCO](https://arxiv.org/abs/2510.07959). +Evaluate language models on [MMLU (Massive Multitask Language Understanding)](https://arxiv.org/abs/2009.03300) with optional efficient evaluation via [DISCO (Diversifying Sample Condensation)](https://arxiv.org/abs/2510.07959). ## Installation -For basic MMLU evaluation: +Install [uv package manager](https://docs.astral.sh/uv/) as described [here](https://docs.astral.sh/uv/getting-started/installation/). 
+ +Create a Python environment: + +```bash +uv venv --python 3.11 +``` + +Install dependencies for basic MMLU evaluation: ```bash -uv pip install .[mmlu] +uv sync --extra mmlu ``` -For DISCO prediction (includes DISCO dependencies): +Install dependencies for MMLU evaluation with DISCO: ```bash -uv pip install .[disco] +uv sync --extra disco ``` ## Run without DISCO (full evaluation) @@ -31,9 +39,8 @@ Full evaluation results look like: Results Summary (Evaluated Tasks) ================================================================================ Total tasks: 14042 -Correct: 8291 -Accuracy (on anchor points): 0.5904 -Accuracy norm (on anchor points): 0.5904 +Correct: 8292 +Accuracy: 0.5905 ``` ## Run with DISCO (predicted full-benchmark score) @@ -47,10 +54,24 @@ uv run python examples/mmlu_benchmark/mmlu_benchmark.py --model_id alignment-han Predicted score output: ``` +================================================================================ +Results Summary (Evaluated Tasks) +================================================================================ +Total tasks: 100 +Correct: 36 +Accuracy: 0.3600 + +================================================================================ +DISCO Prediction +================================================================================ +Computing embeddings and predicting full benchmark accuracy... 
+Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 19171.53it/s] + Using: DISCO predictor from Hugging Face (arubique/DISCO-MMLU) + ---------------------------------------- DISCO Predicted Full Benchmark Accuracy: ---------------------------------------- - Model 0: 0.606739 + Model 0 (alignment-handbook/zephyr-7b-sft-full): 0.602309 ``` ## Arguments diff --git a/examples/mmlu_benchmark/mmlu_benchmark.py b/examples/mmlu_benchmark/mmlu_benchmark.py index 101aeeba..0e4bbc16 100644 --- a/examples/mmlu_benchmark/mmlu_benchmark.py +++ b/examples/mmlu_benchmark/mmlu_benchmark.py @@ -258,8 +258,6 @@ def extract_eval_entries(res): print(f"Saved predictions tensor to {output_path}") print(f" Shape: {predictions.shape}") print(f" Dtype: {predictions.dtype}") - else: - print(f"Built predictions tensor with shape: {predictions.shape}") return predictions @@ -723,8 +721,7 @@ def main(): print("=" * 80) print(f"Total tasks: {metrics['total_tasks']}") print(f"Correct: {metrics['correct_count']}") - print(f"Accuracy (on anchor points): {metrics['acc']:.4f}") - print(f"Accuracy norm (on anchor points): {metrics['acc_norm']:.4f}") + print(f"Accuracy: {metrics['acc']:.4f}") # Build predictions tensor for DISCO predictions = None @@ -754,8 +751,8 @@ def main(): print("\n" + "-" * 40) print("DISCO Predicted Full Benchmark Accuracy:") print("-" * 40) - for model_idx, acc in disco_results["predicted_accuracies"].items(): - print(f" Model {model_idx}: {acc:.6f}") + for model_idx, acc in sorted(disco_results["predicted_accuracies"].items()): + print(f" Model {model_idx} ({args.model_id}): {acc:.6f}") # Save summary summary_data = {