#!/bin/bash
# Launch a Hugging Face text-generation-inference (TGI) server in Docker.
# Arguments: $1 = model id on the Hub, $2 = max batch total tokens.
PORT=8000
MODEL="$1"
TOKENS="$2"

# Expose the container's port 80 on $PORT and cache model weights in ./data
# so they persist across container restarts. Prompts are capped at 1024
# tokens and prompt plus completion at 2048.
docker run --gpus all --shm-size 1g -p "$PORT":80 \
    -v "$PWD"/data:/data \
    ghcr.io/huggingface/text-generation-inference:1.4.0 \
    --model-id "$MODEL" \
    --sharded false \
    --max-input-length 1024 \
    --max-total-tokens 2048 \
    --max-best-of 5 \
    --max-concurrent-requests 5000 \
    --max-batch-total-tokens "$TOKENS"
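
# Example usage, as a sketch: the script name and model id below are
# placeholder assumptions, not part of the original; the request body
# follows TGI's documented /generate REST endpoint. Run the server, then
# query it once it reports ready:
#
#   ./serve_tgi.sh mistralai/Mistral-7B-v0.1 16384
#
#   curl "http://127.0.0.1:8000/generate" \
#       -X POST \
#       -H "Content-Type: application/json" \
#       -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}}'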