Browse Source

fix+feat: docker compose (#264)

* Add docker-compose.yml and corresponding .env

* Remove the now unnecessary entrypoint

* Remove the now unnecessary entrypoint

* Add example .env for mixtral-instruct-awq

* improved rework

* quick entrypoint fix

* quick Dockerfile fix

* another Dockerfile fix

* only build-essential's essentials

* add build context (commented out)

* Fix Dockerfile

Entrypoint exec form doesn't do variable substitution automatically ($HOME)

* Fix Dockerfile

Make entrypoint executable

* Update .env

Changed SSL path to work for non-root user

* Update docker-compose.yml

Changed SSL path to work for non-root user

* Update entrypoint.sh

Changed SSL path to work for non-root user

* Fix OpenAI endpoint in entrypoint.sh

Make it work when ENDPOINT is undefined

* make entrypoint script executable

---------

Co-authored-by: Stefan Schwarz <s.schwarz@mps-solutions.de>
Co-authored-by: AlpinDale <alpindale@gmail.com>
Stefan Daniel Schwarz 1 year ago
parent
commit
810ca83066
4 changed files with 93 additions and 66 deletions
  1. 22 0
      docker/.env
  2. 23 19
      docker/Dockerfile
  3. 23 23
      docker/docker-compose.yml
  4. 25 24
      docker/entrypoint.sh

+ 22 - 0
docker/.env

@@ -0,0 +1,22 @@
+#HUGGING_FACE_HUB_TOKEN=<secret>
+
+#HF_CACHE=~/.cache/huggingface
+#UID=1000
+#GID=0
+#TZ=UTC
+
+#ENDPOINT=openai
+#PORT=5000
+#API_KEY=sk-example # ENDPOINT != kobold
+#SSL_KEYFILE=~/ssl/server.key # ENDPOINT != kobold
+#SSL_CERTFILE=~/ssl/server.crt # ENDPOINT != kobold
+#MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2
+#REVISION=main
+#DATATYPE=half # FP16. Recommended for AWQ quantization.
+#KVCACHE=fp8_e5m2 # Reduces the GPU memory footprint and boosts performance, but may cause a slight accuracy drop.
+#CONTEXT_LENGTH=32768 # If unspecified, will be automatically derived from the model.
+#NUM_GPUS=1
+#GPU_MEMORY_UTILIZATION=0.8 # If you are running out of memory, consider decreasing 'gpu_memory_utilization' or enforcing eager mode.
+#QUANTIZATION=awq
+#ENFORCE_EAGER=true # If you are running out of memory, consider decreasing 'gpu_memory_utilization' or enforcing eager mode.
+#CMD_ADDITIONAL_ARGUMENTS="--seed 0"

+ 23 - 19
docker/Dockerfile

@@ -1,35 +1,39 @@
 FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
-WORKDIR /workspace/aphrodite-engine
+ENV HOME=/app/aphrodite-engine
 
-# Upgrade OS Packages
+WORKDIR $HOME
+
+# Upgrade OS Packages + Prepare Python Environment
 RUN set -eux; \
-    apt-get update \
+    export DEBIAN_FRONTEND=noninteractive \
+    && apt-get update \
     && apt-get upgrade -y \
-    && rm -rf /var/lib/apt/lists/*
-
-# Preparing Conda Environment
-RUN apt-get update \
-    && apt-get install -y git build-essential \
-    && apt-get install python3 python3-pip -y \
+    && apt-get install -y bzip2 g++ git make python3-pip tzdata \
     && rm -fr /var/lib/apt/lists/*
 
-COPY entrypoint.sh /workspace/aphrodite-engine
-
-ENV PATH /opt/conda/envs/aphrodite-engine/bin:$PATH
-
-# alias python3 to python
+# Alias python3 to python
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
-RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip
 
-RUN git clone https://github.com/PygmalionAI/aphrodite-engine.git /tmp/aphrodite-engine
-RUN mv /tmp/aphrodite-engine/* /workspace/aphrodite-engine/
-RUN rm -rf /tmp/aphrodite-engine
+RUN git clone https://github.com/PygmalionAI/aphrodite-engine.git /tmp/aphrodite-engine \
+    && mv /tmp/aphrodite-engine/* . \
+    && rm -fr /tmp/aphrodite-engine \
+    && chmod +x docker/entrypoint.sh
 
 # Export the CUDA_HOME variable correctly
 ENV CUDA_HOME=/usr/local/cuda
 
+ENV HF_HOME=/tmp
+
 RUN python3 -m pip install --no-cache-dir -e .
 
-ENTRYPOINT [ "/app/aphrodite-engine/entrypoint.sh" ]
+# Entrypoint exec form doesn't do variable substitution automatically ($HOME)
+ENTRYPOINT ["/app/aphrodite-engine/docker/entrypoint.sh"]
+
+EXPOSE 5000
+
+USER 1000:0
+
+VOLUME ["/tmp"]

+ 23 - 23
docker/docker-compose.yml

@@ -1,29 +1,29 @@
+version: "3.7"
+
 services:
   aphrodite-engine:
-    build:
-      context: .
-    restart: on-failure:5
-    environment:
-      - NUM_GPUS=${NUM_GPUS}
-      - MODEL_NAME=${MODEL_NAME}
-      - REVISION=${REVISION}
-      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
-      - QUANTIZATION=${QUANTIZATION}
-      - KVCACHE=${KVCACHE}
-      - API_KEY=${API_KEY}
-      - CONTEXT_LENGTH=${CONTEXT_LENGTH}
-      - GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION}
-      - ENFORCE_EAGER=${ENFORCE_EAGER}
-    volumes:
-      - ${HOME}/.cache:/root/.cache/
-    shm_size: 10g
-    ports:
-      - "7860:7860"
-      - "2242:2242"
+    #build:
+    #  context: .
+    container_name: aphrodite-engine
     deploy:
       resources:
         reservations:
           devices:
-          - driver: nvidia
-            count: all
-            capabilities: [gpu]
+            - capabilities: [gpu]
+              count: all
+              driver: nvidia
+    env_file: .env
+    hostname: aphrodite-engine
+    image: alpindale/aphrodite-engine
+    ipc: host
+    ports:
+      - "${PORT:-5000}:5000"
+    restart: on-failure:5
+    user: "${UID:-1000}:${GID:-0}"
+    volumes:
+      - ${HF_CACHE:-hf-cache}:/tmp
+      - ${SSL_CERTFILE:-/dev/null}:/app/aphrodite-engine/server.crt:ro
+      - ${SSL_KEYFILE:-/dev/null}:/app/aphrodite-engine/server.key:ro
+
+volumes:
+  hf-cache:

+ 25 - 24
docker/entrypoint.sh

@@ -1,29 +1,30 @@
-#!/bin/bash
+#!/bin/bash -e
 
-set -xe
-
-cd /app/aphrodite-engine
 echo 'Starting Aphrodite Engine API server...'
-CMD="python3 -m aphrodite.endpoints.openai.api_server \
-             --host 0.0.0.0 \
-             --port 7860 \
-             --model $MODEL_NAME \
-             --tensor-parallel-size $NUM_GPUS \
-             --dtype $DATATYPE \
-             --max-model-len $CONTEXT_LENGTH \
-             --gmu $GPU_MEMORY_UTILIZATION"
 
-if [ -n "$QUANTIZATION" ]; then
-    CMD="$CMD --quantization $QUANTIZATION --dtype half"
-fi
-if [ -n "$API_KEY" ]; then
-    CMD="$CMD --api-keys $API_KEY"
-fi
-if [ -n "$ENFORCE_EAGER" ]; then
-    CMD="$CMD --enforce-eager"
-fi
-if [ -n "$KVCACHE" ]; then
-    CMD="$CMD --kv-cache-dtype $KVCACHE"
+CMD="python3 -m aphrodite.endpoints.${ENDPOINT:-openai}.api_server
+             --host 0.0.0.0
+             --port 5000
+             --download-dir ${HF_HOME:?}/hub
+             ${MODEL_NAME:+--model $MODEL_NAME}
+             ${REVISION:+--revision $REVISION}
+             ${DATATYPE:+--dtype $DATATYPE}
+             ${KVCACHE:+--kv-cache-dtype $KVCACHE}
+             ${CONTEXT_LENGTH:+--max-model-len $CONTEXT_LENGTH}
+             ${NUM_GPUS:+--tensor-parallel-size $NUM_GPUS}
+             ${GPU_MEMORY_UTILIZATION:+--gpu-memory-utilization $GPU_MEMORY_UTILIZATION}
+             ${QUANTIZATION:+--quantization $QUANTIZATION}
+             ${ENFORCE_EAGER:+--enforce-eager}
+             ${CMD_ADDITIONAL_ARGUMENTS}"
+
+# Only the 'openai' endpoint currently supports api-keys and ssl
+if [ "${ENDPOINT:-openai}" = "openai" ]; then
+  CMD+=" ${API_KEY:+--api-keys "$API_KEY"} ${SSL_KEYFILE:+--ssl-keyfile server.key} ${SSL_CERTFILE:+--ssl-certfile server.crt}"
 fi
 
-exec $CMD
+# set umask to ensure group read / write at runtime
+umask 002
+
+set -x
+
+exec $CMD