#HF_TOKEN=
#HF_CACHE=~/.cache/huggingface
#UID=1000
#GID=0
#TZ=UTC
#PORT=7860
#API_KEY=sk-example
#SSL_KEYFILE=~/ssl/server.key
#SSL_CERTFILE=~/ssl/server.crt
#MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2
#REVISION=main
#DATATYPE=half # FP16. Recommended for quantization.
#KVCACHE=fp8_e5m2 # Reduces the GPU memory footprint and boosts performance, but may cause a slight accuracy drop.
#CONTEXT_LENGTH=32768 # If unspecified, will be automatically derived from the model.
#NUM_GPUS=1
#GPU_MEMORY_UTILIZATION=0.8 # If you are running out of memory, consider decreasing 'gpu_memory_utilization' or enforcing eager mode.
#QUANTIZATION=awq
#ENFORCE_EAGER=true # If you are running out of memory, consider decreasing 'gpu_memory_utilization' or enforcing eager mode.
#KOBOLD_API=true # Use this to launch a Kobold-compatible server in addition to the OpenAI one.
#CMD_ADDITIONAL_ARGUMENTS="--seed 0"
#HF_HUB_ENABLE_HF_TRANSFER=1 # For faster downloads.