5858 - [Local](#local-14)
5959 - [Docker](#docker-12)
6060 - [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](#qwenqwen3-235b-a22b-thinking-2507-fp8)
61- - [Multi GPU with extended 768k context length](#multi-gpu-with-extended-768k-context-length)
61+ - [Multi GPU with extended 512k context length](#multi-gpu-with-extended-512k-context-length)
6262 - [Local](#local-15)
6363 - [Docker](#docker-13)
6464 - [`openai/gpt-oss-120b`](#openaigpt-oss-120b)
65- - [Multi GPU with extended 2M context length](#multi-gpu-with-extended-2m-context-length)
65+ - [Multi GPU with extended 1M context length](#multi-gpu-with-extended-1m-context-length)
6666 - [Local](#local-16)
6767 - [Docker](#docker-14)
6868
@@ -1261,9 +1261,9 @@ python \
12611261
12621262## `Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`
12631263
1264- ### Multi GPU with extended 768k context length
1264+ ### Multi GPU with extended 512k context length
12651265
1266- - 768k context length (with context extension)
1266+ - 512k context length (with context extension)
12671267- Cache offloading disabled
12681268- Tested model: [`Qwen/Qwen3-235B-A22B-Thinking-2507-FP8`](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)
12691269- Tested GPU: 8x H100 80GB
@@ -1281,7 +1281,7 @@ TRITON_PRINT_AUTOTUNING=1 \
12811281SRT_WARMUP_ALL_SEQ_LENS=0 \
12821282HIP_DEBUG_FA3_MIXING_LEN=0 \
12831283PASSKEY_DECODE_LEN=128 \
1284- PASSKEY_LEN=500 \
1284+ PASSKEY_LEN=450 \
12851285SA_BLOCK_SIZE=128 \
12861286SA_DECODE_BLOCK_SIZE=128 \
12871287HIP_DISABLE_AUTOTUNE=0 \
@@ -1299,14 +1299,15 @@ uv run -m sglang.launch_server \
12991299--chunked-prefill-size 65536 \
13001300--max-prefill-tokens 65536 \
13011301--cuda-graph-bs 1 2 4 8 \
1302- --context-length 768000 \
1303- --max-total-tokens 768000 \
1302+ --context-length 512000 \
1303+ --max-total-tokens 512000 \
13041304--attention-backend hip_attention \
13051305--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13061306--json-model-override-args ' {"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13071307--max-running-requests 8 \
13081308--trust-remote-code \
1309- --reasoning-parser qwen3-thinking
1309+ --reasoning-parser qwen3-thinking \
1310+ --tool-call-parser qwen25
13101311```
13111312
13121313#### Docker
@@ -1324,15 +1325,15 @@ docker run --rm \
13241325--env " SRT_WARMUP_ALL_SEQ_LENS=0" \
13251326--env " HIP_DEBUG_FA3_MIXING_LEN=0" \
13261327--env " PASSKEY_DECODE_LEN=128" \
1327- --env "PASSKEY_LEN=500" \
1328+ --env "PASSKEY_LEN=450" \
13281329--env " SA_BLOCK_SIZE=128" \
13291330--env " SA_DECODE_BLOCK_SIZE=128" \
13301331--env " HIP_DISABLE_AUTOTUNE=0" \
13311332--env " HIP_DEBUG=0" \
13321333--env " HIP_DEBUG_BENCH=0" \
13331334--env " HIP_DEBUG_CAPTURE_DECORATOR=1" \
13341335--env " CUDA_LAUNCH_BLOCKING=0" \
1335- deepauto/hip-attention:v1.2.8-sglang \
1336+ deepauto/hip-attention:v1.2.9-sglang \
13361337python \
13371338-m sglang.launch_server \
13381339--host 0.0.0.0 \
@@ -1344,21 +1345,22 @@ python \
13441345--chunked-prefill-size 65536 \
13451346--max-prefill-tokens 65536 \
13461347--cuda-graph-bs 1 2 4 8 \
1347- --context-length 768000 \
1348- --max-total-tokens 768000 \
1348+ --context-length 512000 \
1349+ --max-total-tokens 512000 \
13491350--attention-backend hip_attention \
13501351--hip-attention-config ./configs/mixed_landmark_0722_no_extend_fast.json \
13511352--json-model-override-args ' {"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}' \
13521353--max-running-requests 8 \
13531354--trust-remote-code \
1354- --reasoning-parser qwen3-thinking
1355+ --reasoning-parser qwen3-thinking \
1356+ --tool-call-parser qwen25
13551357```
13561358
13571359## `openai/gpt-oss-120b`
13581360
1359- ### Multi GPU with extended 2M context length
1361+ ### Multi GPU with extended 1M context length
13601362
1361- - 2M context length (with context extension)
1363+ - 1M context length (with context extension)
13621364- Cache offloading disabled
13631365- Tested model: [`lmsys/gpt-oss-120b-bf16`](https://huggingface.co/lmsys/gpt-oss-120b-bf16)
13641366- Tested GPU: 8x H100 80GB
@@ -1371,6 +1373,7 @@ python \
13711373
13721374``` bash
13731375CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
1376+ SRT_DEFAULT_REASONING_EFFORT=high \
13741377SA_BLOCKWISE_MASKING=0 \
13751378SRT_FORCE_SPECIAL_TOKENS=1 \
13761379HIP_DEBUG_RECOMPUTE_SPLIT=0 \
@@ -1391,17 +1394,18 @@ uv run -m sglang.launch_server \
13911394--model-path lmsys/gpt-oss-120b-bf16 \
13921395--kv-cache-dtype fp8_e4m3 \
13931396--tp-size 8 \
1394- --chunked-prefill-size 262144 \
1395- --max-prefill-tokens 262144 \
1396- --cuda-graph-bs 1 2 4 8 \
1397- --context-length 2048000 \
1398- --max-total-tokens 2048000 \
1397+ --chunked-prefill-size 131072 \
1398+ --max-prefill-tokens 131072 \
1399+ --cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1400+ --context-length 1024000 \
1401+ --max-total-tokens 6144000 \
13991402--attention-backend hip_attention \
14001403--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1401- --max-running-requests 8 \
1402- --trust-remote-code \
1404+ --hip-attention-config-override-json ' {"self_extend_scale": 20}' \
14031405--chat-template ./configs/gptoss.jinja \
1404- --reasoning-parser gpt-oss
1406+ --max-running-requests 32 \
1407+ --reasoning-parser gpt-oss \
1408+ --tool-call-parser gpt-oss
14051409```
14061410
14071411#### Docker
@@ -1414,6 +1418,7 @@ docker run --rm \
14141418--ipc=host \
14151419-v ${HF_HOME:-"$HOME/.cache/huggingface"}:/root/.cache/huggingface \
14161420--env "HF_TOKEN=${HF_TOKEN}" \
1421+ --env " SRT_DEFAULT_REASONING_EFFORT=high" \
14171422--env " SA_BLOCKWISE_MASKING=0" \
14181423--env " SRT_FORCE_SPECIAL_TOKENS=1" \
14191424--env " HIP_DEBUG_RECOMPUTE_SPLIT=0" \
@@ -1428,23 +1433,23 @@ docker run --rm \
14281433--env " HIP_DEBUG=0" \
14291434--env " HIP_DEBUG_BENCH=0" \
14301435--env " HIP_DEBUG_CAPTURE_DECORATOR=1" \
1431- deepauto/hip-attention:v1.2.8-sglang \
1436+ deepauto/hip-attention:v1.2.9-sglang \
14321437python \
14331438-m sglang.launch_server \
14341439--host 0.0.0.0 \
14351440--port 8000 \
14361441--model-path lmsys/gpt-oss-120b-bf16 \
14371442--kv-cache-dtype fp8_e4m3 \
14381443--tp-size 8 \
1439- --chunked-prefill-size 262144 \
1440- --max-prefill-tokens 262144 \
1441- --cuda-graph-bs 1 2 4 8 \
1442- --context-length 2048000 \
1443- --max-total-tokens 2048000 \
1444+ --chunked-prefill-size 131072 \
1445+ --max-prefill-tokens 131072 \
1446+ --cuda-graph-bs 1 2 4 8 12 16 20 24 28 32 \
1447+ --context-length 1024000 \
1448+ --max-total-tokens 6144000 \
14441449--attention-backend hip_attention \
14451450--hip-attention-config ./configs/mixed_landmark_0806_gptoss.json \
1446- --max-running-requests 8 \
1447- --trust-remote-code \
14481451--chat-template ./configs/gptoss.jinja \
1449- --reasoning-parser gpt-oss
1452+ --max-running-requests 32 \
1453+ --reasoning-parser gpt-oss \
1454+ --tool-call-parser gpt-oss
14501455```
0 commit comments