From 6ce1492b35d2c5042752e953019dfe84beadd7ea Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Thu, 12 Mar 2026 16:42:04 +0800 Subject: [PATCH 1/4] deep-fin-pre-commit-patch --- .pre-commit-config.yaml | 1 + .../example_deep_finance/deep_finance_judge.py | 15 ++++++++------- .../example_deep_finance/judge/cgcv/json_utils.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b97f95..15cebb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + exclude: ^tutorial/example_deep_finance/ - id: check-added-large-files - id: check-ast - id: check-json diff --git a/tutorial/example_deep_finance/deep_finance_judge.py b/tutorial/example_deep_finance/deep_finance_judge.py index 8a8e354..071eccf 100644 --- a/tutorial/example_deep_finance/deep_finance_judge.py +++ b/tutorial/example_deep_finance/deep_finance_judge.py @@ -200,7 +200,7 @@ def _load(path, key): _load(train_ref_ans_path, "train") _load(val_ref_ans_path, "val") - def _get_reference_data(self, task_id: str) -> Tuple[str, str]: + def _get_reference_data(self, task_id: str) -> Tuple[str, str | None]: """获取任务的参考答案和领域""" cache_key = "val" if task_id.startswith("val_") else "train" ans = DeepFinanceJudgeByOpenJudge._ref_answers_cache.get(cache_key, {}).get(task_id, "") @@ -301,8 +301,8 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO # 1. 
提取输入数据 history = metadata.get("conversation_history", []) - query = metadata.get("query") or getattr(workflow_task.task, "main_query", "") - task_id = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "") + query: str = metadata.get("query") or getattr(workflow_task.task, "main_query", "") + task_id: str = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "") rubrics = metadata.get("rubrics") # 可能是 None 或 list of dicts step_reward = metadata.get("reward_stats", {}).get("step_reward", 0.0) chat_date = metadata.get("chat_date") if metadata else datetime.now().strftime("%Y-%m-%d") @@ -318,7 +318,7 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO # RM Gallery 耗时记录 rm_start_time = time.time() if self._rm_enabled and self.rm_evaluator: - rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain) + rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain or "") else: rm_raw = 0.0 rm_time = time.time() - rm_start_time @@ -788,19 +788,20 @@ def _save_evaluation_log(self, task_id: str, grader_results: Dict[str, List[Any] 保存 OpenJudge 评估日志(可选) """ try: + grader_results_log: Dict[str, List[Dict[str, Any]]] = {} log = { "task_id": task_id, "query": query, "timestamp": datetime.now().isoformat(), - "grader_results": {} + "grader_results": grader_results_log } # 简化 grader_results 以便序列化 for grader_name, score_list in grader_results.items(): - log["grader_results"][grader_name] = [] + grader_results_log[grader_name] = [] for score in score_list: if hasattr(score, "score"): - log["grader_results"][grader_name].append({ + grader_results_log[grader_name].append({ "score": score.score, "reason": score.reason[:200] if hasattr(score, "reason") else "", }) diff --git a/tutorial/example_deep_finance/judge/cgcv/json_utils.py b/tutorial/example_deep_finance/judge/cgcv/json_utils.py index 7301401..fe6c810 100644 --- 
a/tutorial/example_deep_finance/judge/cgcv/json_utils.py +++ b/tutorial/example_deep_finance/judge/cgcv/json_utils.py @@ -299,7 +299,7 @@ def validate_cgcv_schema(obj: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], # 验证 status if normalized["status"] not in VALID_STATUSES: # 尝试模糊匹配 - status_lower = normalized["status"] + status_lower: str = normalized["status"] matched = False for valid_status in VALID_STATUSES: if valid_status in status_lower or status_lower in valid_status: From 9707fafcd362ea68c28cab45781045558a7edc54 Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Fri, 13 Mar 2026 16:04:29 +0800 Subject: [PATCH 2/4] revise openclaw training --- .gitignore | 1 + .../opencode_build_openclaw_agent/README.md | 46 +++++- .../on_compute_relative_reward.py | 138 ++++++++++++++---- .../test_reward.py | 93 ++++++++++++ 4 files changed, 249 insertions(+), 29 deletions(-) create mode 100644 tutorial/opencode_build_openclaw_agent/test_reward.py diff --git a/.gitignore b/.gitignore index db79fdf..00da513 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,4 @@ werewolves_swarm .claude tensorboard_log tutorial/**/*.json +node_modules diff --git a/tutorial/opencode_build_openclaw_agent/README.md b/tutorial/opencode_build_openclaw_agent/README.md index 5c69a53..eefb51d 100644 --- a/tutorial/opencode_build_openclaw_agent/README.md +++ b/tutorial/opencode_build_openclaw_agent/README.md @@ -75,8 +75,19 @@ In a new terminal: ```bash cd tutorial/opencode_build_openclaw_agent + +# Option 1: Use OpenJudge pointwise grading (default) +export AJET_SWARM_URL="http://localhost:10086" +export NUM_REPEAT=4 +export REWARD_MODE=pointwise +export DASHSCOPE_API_KEY=your_api_key_here +python fake_vllm_endpoint.py + +# Option 2: Use OpenJudge listwise ranking export AJET_SWARM_URL="http://localhost:10086" export NUM_REPEAT=4 +export REWARD_MODE=listwise +export DASHSCOPE_API_KEY=your_api_key_here python fake_vllm_endpoint.py ``` @@ -113,13 +124,40 @@ Key parameters in 
`fake_vllm_endpoint.py`: - `num_repeat=4` - GRPO N parameter (responses per query) - `model` - Base model path +Environment variables for reward computation: + +- `REWARD_MODE` - Reward computation mode: `pointwise` (default) or `listwise` +- `DASHSCOPE_API_KEY` - API key for OpenJudge LLM grader +- `JUDGE_BASE_URL` - Base URL for judge model API (default: DashScope) +- `JUDGE_MODEL` - Judge model name (default: `qwen-plus`) + ## Reward Function -The `ExtraversionGrader` evaluates responses on a 1-10 scale: -- 1 = Highly introverted (reserved, quiet) -- 10 = Highly extraverted (energetic, enthusiastic) +Two OpenJudge-based reward modes are available: + +### 1. Pointwise Mode (Default) -Scores are normalized to [-1, 1] for GRPO training. +Uses OpenJudge LLM grader to evaluate each response independently: +- Evaluates extraversion traits on 1-10 scale +- Provides detailed reasoning for each score +- Scores normalized to [-1, 1] for GRPO training + +```bash +export REWARD_MODE=pointwise +export DASHSCOPE_API_KEY=your_api_key_here +``` + +### 2. 
Listwise Mode + +Uses OpenJudge to rank all responses together: +- Compares responses directly against each other +- Produces relative rankings +- Best for capturing subtle differences + +```bash +export REWARD_MODE=listwise +export DASHSCOPE_API_KEY=your_api_key_here +``` ## Monitoring diff --git a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py index ea7c164..5bafd2f 100644 --- a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py +++ b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py @@ -1,41 +1,129 @@ # -*- coding: utf-8 -*- -"""Compute relative rewards based on extraversion personality alignment.""" +"""Compute relative rewards based on extraversion personality alignment using OpenJudge.""" +import os from typing import List, Dict from beast_logger import print_listofdict +from openjudge.graders.base_grader import GraderMode, GraderScore, GraderRank +from openjudge.graders.llm_grader import LLMGrader +from openjudge.models import OpenAIChatModel -def score_extraversion(response_text: str) -> float: - """Score response for extraversion traits (1-10 scale).""" - extraversion_keywords = [ - 'excited', 'love', 'amazing', 'awesome', 'fantastic', 'great', - 'wonderful', 'thrilled', 'energetic', 'enthusiastic', 'fun', - 'social', 'outgoing', 'active', 'lively', 'vibrant', 'happy', - 'enjoy', 'delighted', 'cheerful', 'positive' - ] +# Configuration +REWARD_MODE = os.getenv("REWARD_MODE", "pointwise") # Options: pointwise, listwise +API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxx") +BASE_URL = os.getenv("JUDGE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1") +JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen-plus") - text_lower = response_text.lower() - score = 5.0 +# OpenJudge grader setup +judge_model = OpenAIChatModel( + model=JUDGE_MODEL, + api_key=API_KEY, + base_url=BASE_URL, +) - for keyword in extraversion_keywords: - if 
keyword in text_lower: - score += 0.5 +EXTRAVERSION_PROMPT = """You are evaluating responses for extraversion personality traits. - score += min(response_text.count('!') * 0.3, 2.0) +Extraversion characteristics include: +- Outgoing, energetic, enthusiastic tone +- Social engagement and excitement +- Positive, upbeat language +- Action-oriented expressions +- Use of exclamation marks and emotional words - if len(response_text) < 50: - score -= 1.0 +Rate the response on a scale of 0.0-1.0: +0.0 = Highly introverted (reserved, quiet, minimal emotion) +1.0 = Highly extraverted (energetic, enthusiastic, very expressive) - return max(1.0, min(10.0, score)) +Question: {question} +Response: {response} -async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]: - """Compute relative rewards for extraversion alignment.""" +Return a json object with exactly two fields: +- "score": float between 0.0 and 1.0 +- "reason": brief explanation""" + +def build_listwise_template(n: int) -> str: + """Build a listwise prompt template for n responses.""" + answers_block = "\n".join([f"{i+1}. {{answer_{i+1}}}" for i in range(n)]) + return f"""You are ranking multiple responses based on extraversion personality traits. + +Extraversion characteristics include: +- Outgoing, energetic, enthusiastic tone +- Social engagement and excitement +- Positive, upbeat language +- Action-oriented expressions + +Question: {{question}} + +Responses to rank: +{answers_block} + +Rank these responses from most extraverted to least extraverted. +Return a json object with exactly two fields: +- "rank": list of integers (1-indexed) ordered from most to least extraverted, e.g. 
[2, 1, 3] +- "reason": brief explanation of the ranking""" + +pointwise_grader = LLMGrader( + name="extraversion_pointwise", + mode=GraderMode.POINTWISE, + description="Evaluate extraversion traits", + model=judge_model, + template=EXTRAVERSION_PROMPT, +) + + +async def compute_pointwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: + """Compute rewards using OpenJudge pointwise grading.""" scores = [] for answer in all_answers: content = answer.get("content", "") - raw_score = score_extraversion(content) - normalized = (raw_score - 5.5) / 4.5 - scores.append(normalized) - answer["reward"] = normalized + result = await pointwise_grader.aevaluate(question=question, response=content) + if isinstance(result, GraderScore): + # score is already normalized 0-1 by OpenJudge + score = result.score + else: + score = 0.0 + scores.append(score) + answer["reward"] = score + return scores + + +async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: + """Compute rewards using OpenJudge listwise ranking.""" + n = len(all_answers) + template = build_listwise_template(n) + grader = LLMGrader( + name="extraversion_listwise", + mode=GraderMode.LISTWISE, + description="Rank responses by extraversion", + model=judge_model, + template=template, + ) + kwargs = {"question": question} + for i, ans in enumerate(all_answers): + kwargs[f"answer_{i+1}"] = ans.get("content", "") + + result = await grader.aevaluate(**kwargs) + + scores = [0.0] * n + if isinstance(result, GraderRank): + # rank is a list of 1-indexed positions ordered best to worst + # convert to reward: rank 1 (best) -> 1.0, rank n (worst) -> 0.0 + for position, idx in enumerate(result.rank): + scores[idx - 1] = 1.0 - (position / (n - 1)) if n > 1 else 0.5 + + for answer, score in zip(all_answers, scores): + answer["reward"] = score + return scores + + +async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]: + """Compute relative 
rewards for extraversion alignment.""" + question = valid_results[0].get("question", "") if valid_results else "" + + if REWARD_MODE == "listwise": + scores = await compute_listwise_rewards(question, all_answers) + else: # pointwise (default) + scores = await compute_pointwise_rewards(question, all_answers) - print_listofdict(all_answers, header="on_compute_relative_reward") + print_listofdict(all_answers, header=f"on_compute_relative_reward (mode={REWARD_MODE})") return scores diff --git a/tutorial/opencode_build_openclaw_agent/test_reward.py b/tutorial/opencode_build_openclaw_agent/test_reward.py new file mode 100644 index 0000000..a731b25 --- /dev/null +++ b/tutorial/opencode_build_openclaw_agent/test_reward.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +"""Test script for on_compute_relative_reward.py using real OpenJudge API.""" + +import asyncio +import sys +import os + +sys.path.insert(0, os.path.dirname(__file__)) +os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY", "sk-xxx") + + +async def test_pointwise(): + """Test pointwise reward mode with real API.""" + print("\n=== Testing Pointwise Mode (real API) ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + + valid_results = [{"question": "What are your thoughts on Paris?"}] + all_answers = [ + {"content": "I'm so excited about Paris! It's amazing and wonderful!"}, + {"content": "Paris is a city in France."}, + {"content": "I absolutely love Paris! 
The energy is fantastic and vibrant!"}, + ] + + try: + scores = await mod.on_compute_relative_reward(valid_results, all_answers) + print(f"Scores: {scores}") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" + assert all(isinstance(s, float) for s in scores), "All scores should be floats" + # extraverted responses should score higher than neutral + assert scores[0] > scores[1], f"Extraverted response should score higher than neutral: {scores}" + assert scores[2] > scores[1], f"Extraverted response should score higher than neutral: {scores}" + print("✓ Pointwise mode test passed") + return True + except Exception as e: + print(f"✗ Pointwise mode test failed: {e}") + import traceback + traceback.print_exc() + return False + + +async def test_listwise(): + """Test listwise reward mode with real API.""" + print("\n=== Testing Listwise Mode (real API) ===") + os.environ["REWARD_MODE"] = "listwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + + valid_results = [{"question": "What are your thoughts on Paris?"}] + all_answers = [ + {"content": "I'm so excited about Paris! It's amazing and wonderful!"}, + {"content": "Paris is a city in France."}, + {"content": "I absolutely love Paris! 
The energy is fantastic and vibrant!"}, + ] + + try: + scores = await mod.on_compute_relative_reward(valid_results, all_answers) + print(f"Scores: {scores}") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" + assert all(isinstance(s, float) for s in scores), "All scores should be floats" + # neutral response should score lowest + assert scores[1] < scores[0] or scores[1] < scores[2], \ + f"Neutral response should score lower than at least one extraverted response: {scores}" + print("✓ Listwise mode test passed") + return True + except Exception as e: + print(f"✗ Listwise mode test failed: {e}") + import traceback + traceback.print_exc() + return False + + +async def main(): + print("Testing on_compute_relative_reward.py (real API)") + print("=" * 50) + + results = [] + results.append(await test_pointwise()) + results.append(await test_listwise()) + + print("\n" + "=" * 50) + print(f"Tests passed: {sum(results)}/{len(results)}") + return all(results) + + +if __name__ == "__main__": + success = asyncio.run(main()) + sys.exit(0 if success else 1) From b6da77fe48b4431b04ac24a2439b6bb7b9b231d6 Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Fri, 13 Mar 2026 16:16:16 +0800 Subject: [PATCH 3/4] add illustration --- docs/en/example_train_multi_model.md | 3 +++ docs/en/example_train_multi_model.zh.md | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/docs/en/example_train_multi_model.md b/docs/en/example_train_multi_model.md index a062ac0..e55d49d 100644 --- a/docs/en/example_train_multi_model.md +++ b/docs/en/example_train_multi_model.md @@ -90,6 +90,9 @@ graph TB C -->|end_episode + reward_14b| S2 ``` +![alt text](https://img.alicdn.com/imgextra/i3/O1CN01vHfNt41LRcQeDMjE4_!!6000000001296-2-tps-1408-768.png) + + **Architecture Explanation**: - **Swarm Server 1 (Port 10086)**: Hosts the 7B model, responsible for Agent 1 and Agent 3's inference and training diff --git a/docs/en/example_train_multi_model.zh.md b/docs/en/example_train_multi_model.zh.md 
index 772a84f..8e74c6b 100644 --- a/docs/en/example_train_multi_model.zh.md +++ b/docs/en/example_train_multi_model.zh.md @@ -88,6 +88,9 @@ graph TB C -->|end_episode + reward_14b| S2 ``` +![alt text](https://img.alicdn.com/imgextra/i3/O1CN01vHfNt41LRcQeDMjE4_!!6000000001296-2-tps-1408-768.png) + + **架构说明**: - **Swarm Server 1 (端口 10086)**:承载 7B 模型,负责 Agent 1 和 Agent 3 的推理与训练 @@ -176,6 +179,8 @@ sequenceDiagram 4. 将各自的奖励汇报给对应的 Swarm Server 5. 两个 Server 独立执行策略梯度更新 + + ## 训练曲线 ![alt text](https://img.alicdn.com/imgextra/i2/O1CN0161wtDk1zZwFmIX15x_!!6000000006729-2-tps-2978-1413.png) From f091efcf8243d1ff58f4db918b32c4eb3de34b00 Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Thu, 19 Mar 2026 18:05:56 +0800 Subject: [PATCH 4/4] add better reward for openclaw agent build --- .../cheatsheet.md | 47 +++ .../fake_vllm_endpoint.py | 14 +- .../on_compute_relative_reward.py | 298 ++++++++++++++++-- .../on_user_submit_new_requests.py | 42 ++- .../test_reward.py | 283 ++++++++++++++--- 5 files changed, 609 insertions(+), 75 deletions(-) create mode 100644 tutorial/opencode_build_openclaw_agent/cheatsheet.md diff --git a/tutorial/opencode_build_openclaw_agent/cheatsheet.md b/tutorial/opencode_build_openclaw_agent/cheatsheet.md new file mode 100644 index 0000000..0d79b05 --- /dev/null +++ b/tutorial/opencode_build_openclaw_agent/cheatsheet.md @@ -0,0 +1,47 @@ +# OpenClaw Reward Cheatsheet + +## Run the test + +```bash +cd agentjet/tutorial/opencode_build_openclaw_agent + +# pointwise (default) +DASHSCOPE_API_KEY=your_key python test_reward.py + +# listwise +REWARD_MODE=listwise DASHSCOPE_API_KEY=your_key python test_reward.py +``` + +## Run the training endpoint + +```bash +# pointwise (default) +AJET_SWARM_URL=http://localhost:10086 \ +DASHSCOPE_API_KEY=your_key \ +REWARD_MODE=pointwise \ +python fake_vllm_endpoint.py + +# listwise +AJET_SWARM_URL=http://localhost:10086 \ +DASHSCOPE_API_KEY=your_key \ +REWARD_MODE=listwise \ +python fake_vllm_endpoint.py +``` + +## Reward 
modes + +| Mode | Description | +|------|-------------| +| `pointwise` | Each response scored independently (0.0–1.0) | +| `listwise` | All responses ranked together (best=1.0, worst=0.0) | + +## Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `REWARD_MODE` | `pointwise` | `pointwise` or `listwise` | +| `DASHSCOPE_API_KEY` | — | DashScope API key (required) | +| `JUDGE_MODEL` | `qwen-plus` | Judge model name | +| `JUDGE_BASE_URL` | DashScope endpoint | Judge model base URL | +| `AJET_SWARM_URL` | `http://localhost:10086` | Swarm server URL | +| `NUM_REPEAT` | `4` | GRPO N (responses per query) | diff --git a/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py b/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py index 0831cd2..e73cc80 100644 --- a/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py +++ b/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py @@ -25,7 +25,7 @@ import sys sys.path.insert(0, os.path.dirname(__file__)) -from on_user_submit_new_requests import on_user_submit_new_requests +from on_user_submit_new_requests import on_user_submit_new_requests, get_query_history from on_compute_relative_reward import on_compute_relative_reward # Configuration @@ -91,6 +91,14 @@ async def proxy_chat_completion(base_url: str, api_key: str, request: Request, i json_data = await request.json() json_data["stream"] = is_stream + # Remove fields not supported by vLLM to avoid warnings + UNSUPPORTED_FIELDS = {"strict", "store"} + for field in UNSUPPORTED_FIELDS: + json_data.pop(field, None) + # Also remove 'strict' from response_format if present + if "response_format" in json_data and isinstance(json_data["response_format"], dict): + json_data["response_format"].pop("strict", None) + async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post(f"{base_url}/chat/completions", json=json_data, headers=headers) resp.raise_for_status() @@ -200,7 +208,7 @@ async def 
handle_one2many_request(request: Request, request_id: str) -> Dict | L valid_results = await run_all_episodes(request, is_stream) all_answers = [extract_assistant_message(r.response) for r in valid_results] - rewards = await on_compute_relative_reward(valid_results, all_answers) + rewards = await on_compute_relative_reward(valid_results, all_answers, question=user_query) await finalize_episodes(task, valid_results, rewards) @@ -259,7 +267,7 @@ async def health_check(): @app.get("/requests") async def get_requests(): """Get all recorded user requests.""" - return {"requests": USER_REQUEST_RECORD} + return {"requests": get_query_history()} if __name__ == "__main__": diff --git a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py index 5bafd2f..53894a9 100644 --- a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py +++ b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py @@ -1,26 +1,55 @@ # -*- coding: utf-8 -*- -"""Compute relative rewards based on extraversion personality alignment using OpenJudge.""" +"""Compute relative rewards based on extraversion, relevance, diversity, and repetition quality.""" import os +import collections from typing import List, Dict + +from loguru import logger from beast_logger import print_listofdict from openjudge.graders.base_grader import GraderMode, GraderScore, GraderRank from openjudge.graders.llm_grader import LLMGrader +from openjudge.graders.common.relevance import RelevanceGrader +from openjudge.graders.format.ngram_repetition_penalty import NgramRepetitionPenaltyGrader from openjudge.models import OpenAIChatModel +try: + from ajet.utils.compute_madness import has_repeat +except ImportError: + # Fallback: when running outside the full ajet package (e.g. tests), + # resolve relative to the repo root. 
+ import sys as _sys + from pathlib import Path as _Path + _repo_root = str(_Path(__file__).resolve().parents[2]) + if _repo_root not in _sys.path: + _sys.path.insert(0, _repo_root) + from ajet.utils.compute_madness import has_repeat +# --------------------------------------------------------------------------- # Configuration -REWARD_MODE = os.getenv("REWARD_MODE", "pointwise") # Options: pointwise, listwise +# --------------------------------------------------------------------------- +REWARD_MODE = os.getenv("REWARD_MODE", "pointwise") # pointwise | listwise API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxx") BASE_URL = os.getenv("JUDGE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1") JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen-plus") -# OpenJudge grader setup +# Reward weights (must sum to 1.0) +W_EXTRAVERSION = float(os.getenv("W_EXTRAVERSION", "0.5")) +W_RELEVANCE = float(os.getenv("W_RELEVANCE", "0.3")) +W_DIVERSITY = float(os.getenv("W_DIVERSITY", "0.2")) + +# Cross-request history buffer size +HISTORY_MAX_SIZE = int(os.getenv("DIVERSITY_HISTORY_SIZE", "25")) + +# --------------------------------------------------------------------------- +# Shared model & graders +# --------------------------------------------------------------------------- judge_model = OpenAIChatModel( model=JUDGE_MODEL, api_key=API_KEY, base_url=BASE_URL, ) +# --- Extraversion grader (custom LLM prompt) --- EXTRAVERSION_PROMPT = """You are evaluating responses for extraversion personality traits. 
Extraversion characteristics include: @@ -41,6 +70,153 @@ - "score": float between 0.0 and 1.0 - "reason": brief explanation""" +pointwise_grader = LLMGrader( + name="extraversion_pointwise", + mode=GraderMode.POINTWISE, + description="Evaluate extraversion traits", + model=judge_model, + template=EXTRAVERSION_PROMPT, +) + +# --- Relevance grader (built-in OpenJudge) --- +relevance_grader = RelevanceGrader(model=judge_model) + +# --- Repetition penalty grader (deterministic, no LLM) --- +# Detects n-gram repetition within a single response. +# Returns score in [0, 1] where 1 = no repetition, 0 = heavily repetitive. +repetition_grader = NgramRepetitionPenaltyGrader( + n=4, # 4-gram detection + penalty_threshold=0.15, # trigger penalty when >15% of n-grams are repeated + use_soft_penalty=True, # gradual penalty rather than cliff + max_penalty=-1.0, # worst case: score becomes 0 + min_scaling=0.0, # at max penalty, multiplier goes to 0 +) + +# --------------------------------------------------------------------------- +# In-process history of recent responses (for cross-request diversity) +# --------------------------------------------------------------------------- +_response_history: List[str] = [] + + +def record_responses_to_history(contents: List[str]) -> None: + """Append new responses to the rolling history buffer.""" + _response_history.extend(contents) + # Trim to keep only the most recent entries + while len(_response_history) > HISTORY_MAX_SIZE: + _response_history.pop(0) + + +# --------------------------------------------------------------------------- +# Diversity: n-gram overlap (fast, deterministic, no LLM needed) +# --------------------------------------------------------------------------- +def _get_ngrams(text: str, n: int = 3) -> collections.Counter: + """Extract character-level n-grams from text.""" + tokens = text.lower().split() + if len(tokens) < n: + return collections.Counter(tokens) + return collections.Counter( + tuple(tokens[i : i + n]) for 
i in range(len(tokens) - n + 1) + ) + + +def _ngram_overlap(text_a: str, text_b: str, n: int = 3) -> float: + """Compute Jaccard overlap of n-grams between two texts. Returns 0-1.""" + ngrams_a = _get_ngrams(text_a, n) + ngrams_b = _get_ngrams(text_b, n) + if not ngrams_a or not ngrams_b: + return 0.0 + intersection = sum((ngrams_a & ngrams_b).values()) + union = sum((ngrams_a | ngrams_b).values()) + return intersection / union if union > 0 else 0.0 + + +def compute_diversity_scores(contents: List[str], history: List[str]) -> List[float]: + """ + Compute a diversity score for each response (0 = duplicate, 1 = fully unique). + + Two components: + 1. Within-batch: average pairwise n-gram overlap with other responses in the batch + 2. Cross-request: max n-gram overlap with any response in the history buffer + + Final diversity_score = 1 - max(within_batch_overlap, cross_request_overlap) + """ + n = len(contents) + scores = [] + for i, content_i in enumerate(contents): + # Within-batch overlap: average overlap with other responses in this batch + if n > 1: + batch_overlaps = [ + _ngram_overlap(content_i, contents[j]) + for j in range(n) + if j != i + ] + within_batch = max(batch_overlaps) # worst-case overlap within batch + else: + within_batch = 0.0 + + # Cross-request overlap: max overlap with any historical response + if history: + cross_request = max(_ngram_overlap(content_i, h) for h in history) + else: + cross_request = 0.0 + + overlap = max(within_batch, cross_request) + scores.append(1.0 - overlap) + + return scores + + +# --------------------------------------------------------------------------- +# Quality gate: repetition & degeneration detection (deterministic) +# --------------------------------------------------------------------------- +async def compute_quality_scores(contents: List[str]) -> List[float]: + """ + Compute a quality multiplier for each response (0 = degenerate, 1 = clean). + + Combines two signals: + 1. 
NgramRepetitionPenaltyGrader — detects looping/repeated n-gram blocks + 2. compute_string_madness — catches nonsense chars, special token leaks, + character-level repetition + + Returns a score in [0, 1] that will be used as a *multiplier* on the + composite reward, so degenerate outputs get crushed to near-zero. + """ + scores = [] + for content in contents: + # --- Signal 1: n-gram repetition (OpenJudge) --- + try: + rep_result = await repetition_grader.aevaluate(response=content) + # NgramRepetitionPenaltyGrader returns penalty in [-1, 0]: + # 0 = no repetition, -1 = max repetition + # Convert to quality: add 1 → [0, 1] + ngram_penalty = rep_result.score if isinstance(rep_result, GraderScore) else 0.0 + ngram_score = 1.0 + ngram_penalty + except Exception as e: + logger.warning(f"NgramRepetitionPenaltyGrader failed: {e}") + ngram_score = 1.0 + + # --- Signal 2: string madness (char-level degeneration) --- + # Only check for word/char repetition and special token leaks. + # We pass checklist=[] to skip the non-ASCII check (accented + # characters like é are legitimate), and check repetition manually. + madness_score = 1.0 # assume clean + if "<|im_start|>" in content: + madness_score = 0.0 + elif has_repeat(content.split(), remember_n_words=5, patience_max=10): + madness_score = 0.0 + elif has_repeat(content, remember_n_words=4, patience_max=200): + madness_score = 0.0 + + # Combined quality: take the minimum (strictest gate wins) + quality = max(0.0, min(1.0, min(ngram_score, madness_score))) + scores.append(quality) + + return scores + + +# --------------------------------------------------------------------------- +# Extraversion scoring (pointwise / listwise) +# --------------------------------------------------------------------------- def build_listwise_template(n: int) -> str: """Build a listwise prompt template for n responses.""" answers_block = "\n".join([f"{i+1}. 
{{answer_{i+1}}}" for i in range(n)]) @@ -62,33 +238,20 @@ def build_listwise_template(n: int) -> str: - "rank": list of integers (1-indexed) ordered from most to least extraverted, e.g. [2, 1, 3] - "reason": brief explanation of the ranking""" -pointwise_grader = LLMGrader( - name="extraversion_pointwise", - mode=GraderMode.POINTWISE, - description="Evaluate extraversion traits", - model=judge_model, - template=EXTRAVERSION_PROMPT, -) - -async def compute_pointwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: - """Compute rewards using OpenJudge pointwise grading.""" +async def compute_pointwise_extraversion(question: str, all_answers: List[Dict]) -> List[float]: + """Compute extraversion scores using pointwise grading.""" scores = [] for answer in all_answers: content = answer.get("content", "") result = await pointwise_grader.aevaluate(question=question, response=content) - if isinstance(result, GraderScore): - # score is already normalized 0-1 by OpenJudge - score = result.score - else: - score = 0.0 + score = result.score if isinstance(result, GraderScore) else 0.0 scores.append(score) - answer["reward"] = score return scores -async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: - """Compute rewards using OpenJudge listwise ranking.""" +async def compute_listwise_extraversion(question: str, all_answers: List[Dict]) -> List[float]: + """Compute extraversion scores using listwise ranking.""" n = len(all_answers) template = build_listwise_template(n) grader = LLMGrader( @@ -106,24 +269,93 @@ async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> Li scores = [0.0] * n if isinstance(result, GraderRank): - # rank is a list of 1-indexed positions ordered best to worst - # convert to reward: rank 1 (best) -> 1.0, rank n (worst) -> 0.0 for position, idx in enumerate(result.rank): scores[idx - 1] = 1.0 - (position / (n - 1)) if n > 1 else 0.5 + return scores + - for answer, score in 
zip(all_answers, scores): - answer["reward"] = score +# --------------------------------------------------------------------------- +# Relevance scoring (built-in OpenJudge RelevanceGrader, score 1-5 → 0-1) +# --------------------------------------------------------------------------- +async def compute_relevance_scores(question: str, all_answers: List[Dict]) -> List[float]: + """Score how relevant each response is to the question. Returns 0-1.""" + scores = [] + for answer in all_answers: + content = answer.get("content", "") + result = await relevance_grader.aevaluate(query=question, response=content) + if isinstance(result, GraderScore): + # RelevanceGrader returns 1-5; normalise to 0-1 + score = (result.score - 1.0) / 4.0 + else: + score = 0.0 + scores.append(max(0.0, min(1.0, score))) return scores -async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]: - """Compute relative rewards for extraversion alignment.""" - question = valid_results[0].get("question", "") if valid_results else "" +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- +async def on_compute_relative_reward( + valid_results: List, + all_answers: List[Dict], + question: str = "", +) -> List[float]: + """ + Compute composite rewards combining extraversion, relevance, diversity, + and a quality gate for repetition/degeneration. + + Final reward = quality * (W_EXTRAVERSION * extraversion + + W_RELEVANCE * relevance + + W_DIVERSITY * diversity) + The quality multiplier (0-1) acts as a hard gate: degenerate responses + (looping, repeated paragraphs, nonsense characters) get their reward + crushed toward zero regardless of other signal scores. + """ + contents = [a.get("content", "") for a in all_answers] + + # 0. Quality gate (deterministic — fast, runs first) + quality_scores = await compute_quality_scores(contents) + + # 1. 
Extraversion score (LLM-based) if REWARD_MODE == "listwise": - scores = await compute_listwise_rewards(question, all_answers) - else: # pointwise (default) - scores = await compute_pointwise_rewards(question, all_answers) + extraversion_scores = await compute_listwise_extraversion(question, all_answers) + else: + extraversion_scores = await compute_pointwise_extraversion(question, all_answers) - print_listofdict(all_answers, header=f"on_compute_relative_reward (mode={REWARD_MODE})") - return scores + # 2. Relevance score (LLM-based) + relevance_scores = await compute_relevance_scores(question, all_answers) + + # 3. Diversity score (deterministic, n-gram overlap) + diversity_scores = compute_diversity_scores(contents, _response_history) + + # Composite reward = quality * weighted_sum + final_scores = [] + for i in range(len(all_answers)): + weighted_sum = ( + W_EXTRAVERSION * extraversion_scores[i] + + W_RELEVANCE * relevance_scores[i] + + W_DIVERSITY * diversity_scores[i] + ) + composite = quality_scores[i] * weighted_sum + final_scores.append(round(composite, 4)) + + # Annotate the answer dict for logging + all_answers[i]["reward"] = final_scores[i] + all_answers[i]["quality"] = round(quality_scores[i], 4) + all_answers[i]["extraversion"] = round(extraversion_scores[i], 4) + all_answers[i]["relevance"] = round(relevance_scores[i], 4) + all_answers[i]["diversity"] = round(diversity_scores[i], 4) + + # Update history buffer with this batch's responses + record_responses_to_history(contents) + + print_listofdict( + all_answers, + header=( + f"on_compute_relative_reward (mode={REWARD_MODE}, " + f"w_ext={W_EXTRAVERSION}, w_rel={W_RELEVANCE}, w_div={W_DIVERSITY}, " + f"quality_gate=multiplicative)" + ), + ) + return final_scores diff --git a/tutorial/opencode_build_openclaw_agent/on_user_submit_new_requests.py b/tutorial/opencode_build_openclaw_agent/on_user_submit_new_requests.py index 07f32a5..11b7932 100644 --- 
a/tutorial/opencode_build_openclaw_agent/on_user_submit_new_requests.py +++ b/tutorial/opencode_build_openclaw_agent/on_user_submit_new_requests.py @@ -1,8 +1,44 @@ # -*- coding: utf-8 -*- -"""Handle new user requests.""" +"""Handle new user requests and track query history for diversity awareness.""" +from typing import List, Dict +from loguru import logger from ajet.schema.task import Task +# Rolling buffer of recent queries — used to detect repeated / near-duplicate +# questions so the system can log warnings. The response-level diversity +# signal lives in on_compute_relative_reward._response_history. +_query_history: List[Dict] = [] +QUERY_HISTORY_MAX = 100 + + +def get_query_history() -> List[Dict]: + """Return the current query history (read-only copy).""" + return list(_query_history) + + async def on_user_submit_new_requests(request_id: str, task: Task) -> None: - """Store user request when submitted.""" - pass # No special processing needed for this use case + """ + Store user request metadata when submitted. + + This populates a lightweight in-process history so that: + 1. The /requests endpoint can expose recent queries for debugging. + 2. We can detect if the same question keeps appearing, which signals + a data distribution issue upstream rather than a model problem. 
+ """ + entry = { + "request_id": request_id, + "task_id": task.task_id, + "query": task.main_query, + } + _query_history.append(entry) + + # Trim oldest entries + while len(_query_history) > QUERY_HISTORY_MAX: + _query_history.pop(0) + + logger.info( + f"[on_user_submit] request_id={request_id} " + f"query_len={len(task.main_query)} " + f"history_size={len(_query_history)}" + ) diff --git a/tutorial/opencode_build_openclaw_agent/test_reward.py b/tutorial/opencode_build_openclaw_agent/test_reward.py index a731b25..8b65922 100644 --- a/tutorial/opencode_build_openclaw_agent/test_reward.py +++ b/tutorial/opencode_build_openclaw_agent/test_reward.py @@ -1,90 +1,301 @@ #!/usr/bin/env python3 -"""Test script for on_compute_relative_reward.py using real OpenJudge API.""" +"""Test script for on_compute_relative_reward.py using real OpenJudge API. + +Tests four reward dimensions: + 1. Extraversion — enthusiastic responses score higher + 2. Relevance — on-topic responses score higher than off-topic + 3. Diversity — unique responses score higher than near-duplicates + 4. 
Quality gate — repetitive/degenerate responses get crushed
+"""
 import asyncio
 import sys
 import os

 sys.path.insert(0, os.path.dirname(__file__))
-os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY", "sk-xxx")
+os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY", "sk-xxx")  # SECURITY: never commit a real key; set DASHSCOPE_API_KEY in the environment


-async def test_pointwise():
-    """Test pointwise reward mode with real API."""
-    print("\n=== Testing Pointwise Mode (real API) ===")
+async def test_pointwise_composite():
+    """Test pointwise composite reward (extraversion + relevance + diversity)."""
+    print("\n=== Testing Pointwise Composite Reward ===")
     os.environ["REWARD_MODE"] = "pointwise"

     import importlib
     import on_compute_relative_reward as mod
     importlib.reload(mod)
+    mod._response_history.clear() # fresh history for test isolation

-    valid_results = [{"question": "What are your thoughts on Paris?"}]
+    question = "What are your thoughts on Paris?"
     all_answers = [
-        {"content": "I'm so excited about Paris! It's amazing and wonderful!"},
+        {"content": "I'm so excited about Paris! The Eiffel Tower at night is breathtaking and the cafes are amazing!"},
         {"content": "Paris is a city in France."},
-        {"content": "I absolutely love Paris! The energy is fantastic and vibrant!"},
+        {"content": "I absolutely love Paris! 
The energy on the Champs-Élysées is fantastic and so vibrant!"}, ] try: - scores = await mod.on_compute_relative_reward(valid_results, all_answers) - print(f"Scores: {scores}") + scores = await mod.on_compute_relative_reward([], all_answers, question=question) + print(f"Composite scores: {scores}") + for a in all_answers: + print(f" ext={a.get('extraversion')}, rel={a.get('relevance')}, " + f"div={a.get('diversity')}, reward={a.get('reward')} " + f"content={a['content'][:50]}...") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" assert all(isinstance(s, float) for s in scores), "All scores should be floats" - # extraverted responses should score higher than neutral - assert scores[0] > scores[1], f"Extraverted response should score higher than neutral: {scores}" - assert scores[2] > scores[1], f"Extraverted response should score higher than neutral: {scores}" - print("✓ Pointwise mode test passed") + # Extraverted + relevant responses should beat the flat neutral one + assert scores[0] > scores[1], f"Enthusiastic on-topic should beat neutral: {scores}" + assert scores[2] > scores[1], f"Enthusiastic on-topic should beat neutral: {scores}" + print("PASSED") + return True + except Exception as e: + print(f"FAILED: {e}") + import traceback; traceback.print_exc() + return False + + +async def test_relevance_penalty(): + """Off-topic answers should get lower composite scores than on-topic ones.""" + print("\n=== Testing Relevance Penalty ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + mod._response_history.clear() + + question = "What is your favorite food?" + all_answers = [ + # On-topic, extraverted + {"content": "Oh my gosh, I absolutely LOVE sushi! The flavors are incredible and I get so excited every time!"}, + # Off-topic, extraverted (talks about space, not food) + {"content": "WOW space exploration is SO exciting! 
Rockets launching into the sky fills me with energy!!!"}, + ] + + try: + scores = await mod.on_compute_relative_reward([], all_answers, question=question) + print(f"Scores: {scores}") + for a in all_answers: + print(f" ext={a.get('extraversion')}, rel={a.get('relevance')}, " + f"div={a.get('diversity')}, reward={a.get('reward')} " + f"content={a['content'][:50]}...") + + # Both are extraverted, but on-topic should win because of relevance + assert scores[0] > scores[1], \ + f"On-topic extraverted should beat off-topic extraverted: {scores}" + print("PASSED") return True except Exception as e: - print(f"✗ Pointwise mode test failed: {e}") - import traceback - traceback.print_exc() + print(f"FAILED: {e}") + import traceback; traceback.print_exc() return False -async def test_listwise(): - """Test listwise reward mode with real API.""" - print("\n=== Testing Listwise Mode (real API) ===") +async def test_diversity_penalty(): + """Near-duplicate answers should get lower diversity scores.""" + print("\n=== Testing Diversity Penalty ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + mod._response_history.clear() + + question = "Tell me about your hobbies." + all_answers = [ + {"content": "I love hiking in the mountains! The fresh air and stunning views make me feel so alive and energized!"}, + # Near-duplicate of answer 0 + {"content": "I love hiking in the mountains! The fresh air and stunning views make me feel so alive and energized!"}, + # Unique answer + {"content": "Dancing is my absolute passion! 
Nothing beats the energy of moving to great music with friends!"}, + ] + + try: + scores = await mod.on_compute_relative_reward([], all_answers, question=question) + print(f"Scores: {scores}") + for a in all_answers: + print(f" ext={a.get('extraversion')}, rel={a.get('relevance')}, " + f"div={a.get('diversity')}, reward={a.get('reward')} " + f"content={a['content'][:50]}...") + + # The duplicate pair should have lower diversity than the unique one + div_duplicate = all_answers[0].get("diversity", 1.0) + div_unique = all_answers[2].get("diversity", 0.0) + assert div_unique > div_duplicate, \ + f"Unique response should have higher diversity ({div_unique}) than duplicate ({div_duplicate})" + print("PASSED") + return True + except Exception as e: + print(f"FAILED: {e}") + import traceback; traceback.print_exc() + return False + + +async def test_cross_request_diversity(): + """Answers that repeat historical responses should be penalized.""" + print("\n=== Testing Cross-Request Diversity ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + mod._response_history.clear() + + # Simulate a prior request that produced a response + mod.record_responses_to_history([ + "I love hiking in the mountains! The fresh air and stunning views make me feel so alive!" + ]) + + question = "What do you enjoy doing on weekends?" + all_answers = [ + # Repeats the historical response almost verbatim + {"content": "I love hiking in the mountains! The fresh air and stunning views make me feel so alive!"}, + # Fresh, unique response + {"content": "Weekends are for exploring new restaurants and trying exotic cuisines! 
I get so thrilled by new flavors!"}, + ] + + try: + scores = await mod.on_compute_relative_reward([], all_answers, question=question) + print(f"Scores: {scores}") + for a in all_answers: + print(f" ext={a.get('extraversion')}, rel={a.get('relevance')}, " + f"div={a.get('diversity')}, reward={a.get('reward')} " + f"content={a['content'][:50]}...") + + div_stale = all_answers[0].get("diversity", 1.0) + div_fresh = all_answers[1].get("diversity", 0.0) + assert div_fresh > div_stale, \ + f"Fresh response should have higher diversity ({div_fresh}) than stale ({div_stale})" + print("PASSED") + return True + except Exception as e: + print(f"FAILED: {e}") + import traceback; traceback.print_exc() + return False + + +async def test_repetition_penalty(): + """Degenerate looping responses should get near-zero reward.""" + print("\n=== Testing Repetition / Degeneration Penalty ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + mod._response_history.clear() + + question = "Tell me about Dunfermline." + + # Build a degenerate looping response (similar to the real failure case) + good_intro = "Hello! Dunfermline is a charming town in Fife, Scotland, with a rich history." + loop_block = ( + "\n\n---\n\n" + "If you have any specific questions or need more information, just " + "let me know! I'm here to assist you in making your visit to " + "Dunfermline a delightful experience.\n\n---\n\n" + "Looking forward to your wonderful Dunfermline adventures!\n\n---\n\n" + "Thank you for the opportunity to share my thoughts on Dunfermline. " + "If you have any more questions or need assistance, feel free to " + "reach out!" + ) + degenerate_response = good_intro + (loop_block * 15) # repeat the block many times + + all_answers = [ + # Degenerate looping response + {"content": degenerate_response}, + # Clean, concise, extraverted response + {"content": "Dunfermline is absolutely wonderful! 
The abbey ruins are breathtaking and the town has such vibrant energy. I love the mix of history and modern community spirit there!"}, + ] + + try: + scores = await mod.on_compute_relative_reward([], all_answers, question=question) + print(f"Scores: {scores}") + for a in all_answers: + print(f" quality={a.get('quality')}, ext={a.get('extraversion')}, " + f"rel={a.get('relevance')}, div={a.get('diversity')}, " + f"reward={a.get('reward')} " + f"content={a['content'][:60]}...") + + quality_degenerate = all_answers[0].get("quality", 1.0) + quality_clean = all_answers[1].get("quality", 0.0) + print(f" Quality scores: degenerate={quality_degenerate}, clean={quality_clean}") + + # The degenerate response should have much lower quality + assert quality_clean > quality_degenerate, \ + f"Clean response quality ({quality_clean}) should exceed degenerate ({quality_degenerate})" + # The clean response should win overall + assert scores[1] > scores[0], \ + f"Clean response ({scores[1]}) should beat degenerate ({scores[0]})" + print("PASSED") + return True + except Exception as e: + print(f"FAILED: {e}") + import traceback; traceback.print_exc() + return False + + +async def test_listwise_composite(): + """Listwise mode should also produce composite rewards.""" + print("\n=== Testing Listwise Composite Reward ===") os.environ["REWARD_MODE"] = "listwise" import importlib import on_compute_relative_reward as mod importlib.reload(mod) + mod._response_history.clear() - valid_results = [{"question": "What are your thoughts on Paris?"}] + question = "What are your thoughts on Paris?" all_answers = [ - {"content": "I'm so excited about Paris! It's amazing and wonderful!"}, + {"content": "I'm so excited about Paris! The Eiffel Tower at night is breathtaking!"}, {"content": "Paris is a city in France."}, - {"content": "I absolutely love Paris! The energy is fantastic and vibrant!"}, + {"content": "I absolutely love Paris! 
The Champs-Élysées energy is fantastic!"}, ] try: - scores = await mod.on_compute_relative_reward(valid_results, all_answers) + scores = await mod.on_compute_relative_reward([], all_answers, question=question) print(f"Scores: {scores}") + for a in all_answers: + print(f" ext={a.get('extraversion')}, rel={a.get('relevance')}, " + f"div={a.get('diversity')}, reward={a.get('reward')} " + f"content={a['content'][:50]}...") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" - assert all(isinstance(s, float) for s in scores), "All scores should be floats" - # neutral response should score lowest + # Neutral response should score lowest assert scores[1] < scores[0] or scores[1] < scores[2], \ f"Neutral response should score lower than at least one extraverted response: {scores}" - print("✓ Listwise mode test passed") + print("PASSED") return True except Exception as e: - print(f"✗ Listwise mode test failed: {e}") - import traceback - traceback.print_exc() + print(f"FAILED: {e}") + import traceback; traceback.print_exc() return False async def main(): - print("Testing on_compute_relative_reward.py (real API)") - print("=" * 50) + print("Testing on_compute_relative_reward.py — Composite Reward") + print("(extraversion + relevance + diversity + quality gate)") + print("=" * 60) results = [] - results.append(await test_pointwise()) - results.append(await test_listwise()) + results.append(await test_pointwise_composite()) + results.append(await test_relevance_penalty()) + results.append(await test_diversity_penalty()) + results.append(await test_cross_request_diversity()) + results.append(await test_repetition_penalty()) + results.append(await test_listwise_composite()) - print("\n" + "=" * 50) - print(f"Tests passed: {sum(results)}/{len(results)}") + print("\n" + "=" * 60) + passed = sum(results) + total = len(results) + print(f"Tests passed: {passed}/{total}") + if not all(results): + names = [ + "pointwise_composite", "relevance_penalty", 
"diversity_penalty", + "cross_request_diversity", "repetition_penalty", "listwise_composite", + ] + for name, ok in zip(names, results): + if not ok: + print(f" FAILED: {name}") return all(results)