Update README.md

README.md CHANGED

@@ -294,7 +294,7 @@ print(response.choices[0].message.content)
 Launch the model using TRT-LLM
 
 ```shell
-docker run -v /home/root/.cache/huggingface/:/root/.cache/huggingface/ --rm --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all --ipc=host --network host -d -e MODEL=NVIDIA-Nemotron-3-Nano-4B-BF16 -e HF_TOKEN=$HF_TOKEN nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc6 bash -c '
+docker run -v /home/root/.cache/huggingface/:/root/.cache/huggingface/ --rm --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all --ipc=host --network host -d -e MODEL=nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16 -e HF_TOKEN=$HF_TOKEN nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc6 bash -c '
 cat > /tmp/extra-llm-api-config.yml <<EOF
 kv_cache_config:
   dtype: "auto"
@@ -308,7 +308,7 @@ moe_config:
 EOF
 
 trtllm-serve \
-    NVIDIA-Nemotron-3-Nano-4B-BF16 \
+    nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16 \
     --host 0.0.0.0 \
     --port 8123 \
     --max_batch_size 32 \