# B200
# Install uv by piping the astral.sh installer script into sh.
# INSTALLER_DOWNLOAD_URL points the installer at the wheelnext build host.
# These must be three separate commands: if the venv commands trail on the same
# line, `sh` receives them as operands and ignores the piped installer.
curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_DOWNLOAD_URL=https://wheelnext.astral.sh sh

# NOTE(review): if `uv` is not found right after installation, start a new
# shell or add the installer's bin directory to PATH — confirm against the
# installer's output.

# Create a virtual environment in ./venv using Python 3.12.0, then activate it
# in the current shell.
uv venv venv --python 3.12.0
source venv/bin/activate
# Install (or upgrade, -U) vLLM into the active environment, with the nightly
# wheel index as an extra package index and --torch-backend=auto.
# Each continuation backslash must be the last character on its line; a
# backslash followed by a space escapes the space and glues it onto the next
# argument instead of continuing the line.
uv pip install -U vllm \
  --torch-backend=auto \
  --extra-index-url https://wheels.vllm.ai/nightly
# Launch vLLM serving RedHatAI/gemma-3-27b-it-FP8-dynamic under the model name
# "gemma3" on 0.0.0.0:9000. Flags are grouped as: endpoint, model/precision,
# parallelism and memory, scheduling/caching, then logging and chat format.
vllm serve RedHatAI/gemma-3-27b-it-FP8-dynamic \
  --host 0.0.0.0 \
  --port 9000 \
  --served-model-name gemma3 \
  --dtype bfloat16 \
  --max-model-len 98304 \
  --tensor-parallel-size 8 \
  --gpu-memory-utilization 0.9 \
  --enable-chunked-prefill \
  --enable-prefix-caching \
  --max-num-batched-tokens 8192 \
  --disable-log-requests \
  --chat-template-content-format openai
# Run the ASGI app `app` from module `main` with uvicorn on 0.0.0.0:18888,
# using 32 workers, the uvloop event loop and the httptools HTTP parser.
# Keep-alive timeout is 30; the listen backlog, concurrency limit and
# max-requests limit are set explicitly below.
uvicorn main:app \
  --host 0.0.0.0 \
  --port 18888 \
  --workers 32 \
  --loop uvloop \
  --http httptools \
  --timeout-keep-alive 30 \
  --backlog 2048 \
  --limit-concurrency 1000 \
  --limit-max-requests 10000
# Variant of the preceding uvicorn command without --http and
# --timeout-keep-alive, so uvicorn's own defaults apply for those two settings.
# NOTE(review): this binds the same port (18888) as the preceding command, so
# it is presumably an alternative to it rather than something to run alongside
# it — confirm.
uvicorn main:app \
  --host 0.0.0.0 \
  --port 18888 \
  --workers 32 \
  --loop uvloop \
  --backlog 2048 \
  --limit-concurrency 1000 \
  --limit-max-requests 10000