Skip to content

Commit 98f9d3b

Browse files
committed
feat: Update qwen3 and gpt-oss-120b instructions
1 parent d4fd41e commit 98f9d3b

1 file changed

Lines changed: 37 additions & 32 deletions

File tree

docs/USAGE.sglang.md

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,11 @@
5858
- [Local](#local-14)
5959
- [Docker](#docker-12)
6060
- [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](#qwenqwen3-235b-a22b-thinking-2507-fp8)
61-
- [Multi GPU with extended 768k context length](#multi-gpu-with-extended-768k-context-length)
61+
- [Multi GPU with extended 512k context length](#multi-gpu-with-extended-512k-context-length)
6262
- [Local](#local-15)
6363
- [Docker](#docker-13)
6464
- [`openai/gpt-oss-120b`](#openaigpt-oss-120b)
65-
- [Multi GPU with extended 2M context length](#multi-gpu-with-extended-2m-context-length)
65+
- [Multi GPU with extended 1M context length](#multi-gpu-with-extended-1m-context-length)
6666
- [Local](#local-16)
6767
- [Docker](#docker-14)
6868

@@ -1261,9 +1261,9 @@ python \
12611261

12621262
## `Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`
12631263

1264-
### Multi GPU with extended 768k context length
1264+
### Multi GPU with extended 512k context length
12651265

1266-
- 768k context length (with context extension)
1266+
- 512k context length (with context extension)
12671267
- Cache offloading disabled
12681268
- Tested model: [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)
12691269
- Tested GPU: 8x H100 80GB
@@ -1281,7 +1281,7 @@ TRITON_PRINT_AUTOTUNING=1 \
12811281
SRT_WARMUP_ALL_SEQ_LENS=0 \
12821282
HIP_DEBUG_FA3_MIXING_LEN=0 \
12831283
PASSKEY_DECODE_LEN=128 \
1284-
PASSKEY_LEN=500 \
1284+
PASSKEY_LEN=450 \
12851285
SA_BLOCK_SIZE=128 \
12861286
SA_DECODE_BLOCK_SIZE=128 \
12871287
HIP_DISABLE_AUTOTUNE=0 \
@@ -1299,14 +1299,15 @@ uv run -m sglang.launch_server \
12991299
--chunked-prefill-size 65536 \
13001300
--max-prefill-tokens 65536 \
13011301
--cuda-graph-bs 1 2 4 8 \
1302-
--context-length 768000 \
1303-
--max-total-tokens 768000 \
1302+
--context-length 512000 \
1303+
--max-total-tokens 512000 \
13041304
--attention-backend hip_attention \
13051305
--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13061306
--json-model-override-args '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13071307
--max-running-requests 8 \
13081308
--trust-remote-code \
1309-
--reasoning-parser qwen3-thinking
1309+
--reasoning-parser qwen3-thinking \
1310+
--tool-call-parser qwen25
13101311
```
13111312

13121313
#### Docker
@@ -1324,15 +1325,15 @@ docker run --rm \
13241325
--env "SRT_WARMUP_ALL_SEQ_LENS=0" \
13251326
--env "HIP_DEBUG_FA3_MIXING_LEN=0" \
13261327
--env "PASSKEY_DECODE_LEN=128" \
1327-
--env "PASSKEY_LEN=500" \
1328+
--env "PASSKEY_LEN=450" \
13281329
--env "SA_BLOCK_SIZE=128" \
13291330
--env "SA_DECODE_BLOCK_SIZE=128" \
13301331
--env "HIP_DISABLE_AUTOTUNE=0" \
13311332
--env "HIP_DEBUG=0" \
13321333
--env "HIP_DEBUG_BENCH=0" \
13331334
--env "HIP_DEBUG_CAPTURE_DECORATOR=1" \
13341335
--env "CUDA_LAUNCH_BLOCKING=0" \
1335-
deepauto/hip-attention:v1.2.8-sglang \
1336+
deepauto/hip-attention:v1.2.9-sglang \
13361337
python \
13371338
-m sglang.launch_server \
13381339
--host 0.0.0.0 \
@@ -1344,21 +1345,22 @@ python \
13441345
--chunked-prefill-size 65536 \
13451346
--max-prefill-tokens 65536 \
13461347
--cuda-graph-bs 1 2 4 8 \
1347-
--context-length 768000 \
1348-
--max-total-tokens 768000 \
1348+
--context-length 512000 \
1349+
--max-total-tokens 512000 \
13491350
--attention-backend hip_attention \
13501351
--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13511352
--json-model-override-args '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13521353
--max-running-requests 8 \
13531354
--trust-remote-code \
1354-
--reasoning-parser qwen3-thinking
1355+
--reasoning-parser qwen3-thinking \
1356+
--tool-call-parser qwen25
13551357
```
13561358

13571359
## `openai/gpt-oss-120b`
13581360

1359-
### Multi GPU with extended 2M context length
1361+
### Multi GPU with extended 1M context length
13601362

1361-
- 2M context length (with context extension)
1363+
- 1M context length (with context extension)
13621364
- Cache offloading disabled
13631365
- Tested model: [`lmsys/gpt-oss-120b-bf16`](https://huggingface.co/lmsys/gpt-oss-120b-bf16)
13641366
- Tested GPU: 8x H100 80GB
@@ -1371,6 +1373,7 @@ python \
13711373

13721374
```bash
13731375
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
1376+
SRT_DEFAULT_REASONING_EFFORT=high \
13741377
SA_BLOCKWISE_MASKING=0 \
13751378
SRT_FORCE_SPECIAL_TOKENS=1 \
13761379
HIP_DEBUG_RECOMPUTE_SPLIT=0 \
@@ -1391,17 +1394,18 @@ uv run -m sglang.launch_server \
13911394
--model-path lmsys/gpt-oss-120b-bf16 \
13921395
--kv-cache-dtype fp8_e4m3 \
13931396
--tp-size 8 \
1394-
--chunked-prefill-size 262144 \
1395-
--max-prefill-tokens 262144 \
1396-
--cuda-graph-bs 1 2 4 8 \
1397-
--context-length 2048000 \
1398-
--max-total-tokens 2048000 \
1397+
--chunked-prefill-size 131072 \
1398+
--max-prefill-tokens 131072 \
1399+
--cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1400+
--context-length 1024000 \
1401+
--max-total-tokens 6144000 \
13991402
--attention-backend hip_attention \
14001403
--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1401-
--max-running-requests 8 \
1402-
--trust-remote-code \
1404+
--hip-attention-config-override-json '{"self_extend_scale": 20}' \
14031405
--chat-template ./configs/gptoss.jinja \
1404-
--reasoning-parser gpt-oss
1406+
--max-running-requests 32 \
1407+
--reasoning-parser gpt-oss \
1408+
--tool-call-parser gpt-oss
14051409
```
14061410

14071411
#### Docker
@@ -1414,6 +1418,7 @@ docker run --rm \
14141418
--ipc=host \
14151419
-v ${HF_HOME:-"$HOME/.cache/huggingface"}:/root/.cache/huggingface \
14161420
--env "HF_TOKEN=${HF_TOKEN}" \
1421+
--env "SRT_DEFAULT_REASONING_EFFORT=high" \
14171422
--env "SA_BLOCKWISE_MASKING=0" \
14181423
--env "SRT_FORCE_SPECIAL_TOKENS=1" \
14191424
--env "HIP_DEBUG_RECOMPUTE_SPLIT=0" \
@@ -1428,23 +1433,23 @@ docker run --rm \
14281433
--env "HIP_DEBUG=0" \
14291434
--env "HIP_DEBUG_BENCH=0" \
14301435
--env "HIP_DEBUG_CAPTURE_DECORATOR=1" \
1431-
deepauto/hip-attention:v1.2.8-sglang \
1436+
deepauto/hip-attention:v1.2.9-sglang \
14321437
python \
14331438
-m sglang.launch_server \
14341439
--host 0.0.0.0 \
14351440
--port 8000 \
14361441
--model-path lmsys/gpt-oss-120b-bf16 \
14371442
--kv-cache-dtype fp8_e4m3 \
14381443
--tp-size 8 \
1439-
--chunked-prefill-size 262144 \
1440-
--max-prefill-tokens 262144 \
1441-
--cuda-graph-bs 1 2 4 8 \
1442-
--context-length 2048000 \
1443-
--max-total-tokens 2048000 \
1444+
--chunked-prefill-size 131072 \
1445+
--max-prefill-tokens 131072 \
1446+
--cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1447+
--context-length 1024000 \
1448+
--max-total-tokens 6144000 \
14441449
--attention-backend hip_attention \
14451450
--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1446-
--max-running-requests 8 \
1447-
--trust-remote-code \
14481451
--chat-template ./configs/gptoss.jinja \
1449-
--reasoning-parser gpt-oss
1452+
--max-running-requests 32 \
1453+
--reasoning-parser gpt-oss \
1454+
--tool-call-parser gpt-oss
14501455
```

0 commit comments

Comments (0)