5858 - [Local](#local-14)
5959 - [Docker](#docker-12)
6060 - [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](#qwenqwen3-235b-a22b-thinking-2507-fp8)
61- - [Multi GPU with extended 768k context length](#multi-gpu-with-extended-768k-context-length)
61+ - [Multi GPU with extended 512k context length](#multi-gpu-with-extended-512k-context-length)
6262 - [Local](#local-15)
6363 - [Docker](#docker-13)
6464 - [`openai/gpt-oss-120b`](#openaigpt-oss-120b)
65- - [Multi GPU with extended 2M context length](#multi-gpu-with-extended-2m-context-length)
65+ - [Multi GPU with extended 1M context length](#multi-gpu-with-extended-1m-context-length)
6666 - [Local](#local-16)
6767 - [Docker](#docker-14)
6868
@@ -1261,9 +1261,9 @@ python \
12611261
12621262## `Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`
12631263
1264- ### Multi GPU with extended 768k context length
1264+ ### Multi GPU with extended 512k context length
12651265
1266- - 768k context length (with context extension)
1266+ - 512k context length (with context extension)
12671267- Cache offloading disabled
12681268- Tested model: [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)
12691269- Tested GPU: 8x H100 80GB
@@ -1281,7 +1281,7 @@ TRITON_PRINT_AUTOTUNING=1 \
12811281SRT_WARMUP_ALL_SEQ_LENS=0 \
12821282HIP_DEBUG_FA3_MIXING_LEN=0 \
12831283PASSKEY_DECODE_LEN=128 \
1284- PASSKEY_LEN=500 \
1284+ PASSKEY_LEN=450 \
12851285SA_BLOCK_SIZE=128 \
12861286SA_DECODE_BLOCK_SIZE=128 \
12871287HIP_DISABLE_AUTOTUNE=0 \
@@ -1299,14 +1299,15 @@ uv run -m sglang.launch_server \
12991299--chunked-prefill-size 65536 \
13001300--max-prefill-tokens 65536 \
13011301--cuda-graph-bs 1 2 4 8 \
1302- --context-length 768000 \
1303- --max-total-tokens 768000 \
1302+ --context-length 512000 \
1303+ --max-total-tokens 512000 \
13041304--attention-backend hip_attention \
13051305--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13061306--json-model-override-args ' {"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13071307--max-running-requests 8 \
13081308--trust-remote-code \
1309- --reasoning-parser qwen3-thinking
1309+ --reasoning-parser qwen3-thinking \
1310+ --tool-call-parser qwen25
13101311```
13111312
13121313#### Docker
@@ -1324,15 +1325,15 @@ docker run --rm \
13241325--env " SRT_WARMUP_ALL_SEQ_LENS=0" \
13251326--env " HIP_DEBUG_FA3_MIXING_LEN=0" \
13261327--env " PASSKEY_DECODE_LEN=128" \
1327- --env "PASSKEY_LEN=500" \
1328+ --env "PASSKEY_LEN=450" \
13281329--env " SA_BLOCK_SIZE=128" \
13291330--env " SA_DECODE_BLOCK_SIZE=128" \
13301331--env " HIP_DISABLE_AUTOTUNE=0" \
13311332--env " HIP_DEBUG=0" \
13321333--env " HIP_DEBUG_BENCH=0" \
13331334--env " HIP_DEBUG_CAPTURE_DECORATOR=1" \
13341335--env " CUDA_LAUNCH_BLOCKING=0" \
1335- deepauto/hip-attention:v1.2.8-sglang \
1336+ deepauto/hip-attention:v1.2.9-sglang \
13361337python \
13371338-m sglang.launch_server \
13381339--host 0.0.0.0 \
@@ -1344,21 +1345,22 @@ python \
13441345--chunked-prefill-size 65536 \
13451346--max-prefill-tokens 65536 \
13461347--cuda-graph-bs 1 2 4 8 \
1347- --context-length 768000 \
1348- --max-total-tokens 768000 \
1348+ --context-length 512000 \
1349+ --max-total-tokens 512000 \
13491350--attention-backend hip_attention \
13501351--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13511352--json-model-override-args ' {"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13521353--max-running-requests 8 \
13531354--trust-remote-code \
1354- --reasoning-parser qwen3-thinking
1355+ --reasoning-parser qwen3-thinking \
1356+ --tool-call-parser qwen25
13551357```
13561358
13571359## `openai/gpt-oss-120b`
13581360
1359- ### Multi GPU with extended 2M context length
1361+ ### Multi GPU with extended 1M context length
13601362
1361- - 2M context length (with context extension)
1363+ - 1M context length (with context extension)
13621364- Cache offloading disabled
13631365- Tested model: [`lmsys/gpt-oss-120b-bf16`](https://huggingface.co/lmsys/gpt-oss-120b-bf16)
13641366- Tested GPU: 8x H100 80GB
@@ -1371,6 +1373,7 @@ python \
13711373
13721374``` bash
13731375CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
1376+ SRT_DEFAULT_REASONING_EFFORT=high \
13741377SA_BLOCKWISE_MASKING=0 \
13751378SRT_FORCE_SPECIAL_TOKENS=1 \
13761379HIP_DEBUG_RECOMPUTE_SPLIT=0 \
@@ -1391,17 +1394,18 @@ uv run -m sglang.launch_server \
13911394--model-path lmsys/gpt-oss-120b-bf16 \
13921395--kv-cache-dtype fp8_e4m3 \
13931396--tp-size 8 \
1394- --chunked-prefill-size 262144 \
1395- --max-prefill-tokens 262144 \
1396- --cuda-graph-bs 1 2 4 8 \
1397- --context-length 2048000 \
1398- --max-total-tokens 2048000 \
1397+ --chunked-prefill-size 131072 \
1398+ --max-prefill-tokens 131072 \
1399+ --cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1400+ --context-length 1024000 \
1401+ --max-total-tokens 6144000 \
13991402--attention-backend hip_attention \
14001403--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1401- --max-running-requests 8 \
1402- --trust-remote-code \
1404+ --hip-attention-config-override-json ' {"self_extend_scale": 20}' \
14031405--chat-template ./configs/gptoss.jinja \
1404- --reasoning-parser gpt-oss
1406+ --max-running-requests 32 \
1407+ --reasoning-parser gpt-oss \
1408+ --tool-call-parser gpt-oss
14051409```
14061410
14071411#### Docker
@@ -1414,6 +1418,7 @@ docker run --rm \
14141418--ipc=host \
14151419-v ${HF_HOME:-"$HOME/.cache/huggingface"}:/root/.cache/huggingface \
14161420--env "HF_TOKEN=${HF_TOKEN}" \
1421+ --env " SRT_DEFAULT_REASONING_EFFORT=high" \
14171422--env " SA_BLOCKWISE_MASKING=0" \
14181423--env " SRT_FORCE_SPECIAL_TOKENS=1" \
14191424--env " HIP_DEBUG_RECOMPUTE_SPLIT=0" \
@@ -1428,23 +1433,23 @@ docker run --rm \
14281433--env " HIP_DEBUG=0" \
14291434--env " HIP_DEBUG_BENCH=0" \
14301435--env " HIP_DEBUG_CAPTURE_DECORATOR=1" \
1431- deepauto/hip-attention:v1.2.8-sglang \
1436+ deepauto/hip-attention:v1.2.9-sglang \
14321437python \
14331438-m sglang.launch_server \
14341439--host 0.0.0.0 \
14351440--port 8000 \
14361441--model-path lmsys/gpt-oss-120b-bf16 \
14371442--kv-cache-dtype fp8_e4m3 \
14381443--tp-size 8 \
1439- --chunked-prefill-size 262144 \
1440- --max-prefill-tokens 262144 \
1441- --cuda-graph-bs 1 2 4 8 \
1442- --context-length 2048000 \
1443- --max-total-tokens 2048000 \
1444+ --chunked-prefill-size 131072 \
1445+ --max-prefill-tokens 131072 \
1446+ --cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1447+ --context-length 1024000 \
1448+ --max-total-tokens 6144000 \
14441449--attention-backend hip_attention \
14451450--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1446- --max-running-requests 8 \
1447- --trust-remote-code \
14481451--chat-template ./configs/gptoss.jinja \
1449- --reasoning-parser gpt-oss
1452+ --max-running-requests 32 \
1453+ --reasoning-parser gpt-oss \
1454+ --tool-call-parser gpt-oss
14501455```
0 commit comments