Dev update (23.8.17.) #4

Merged: 15 commits, Aug 17, 2023
.gitignore: 4 changes (3 additions, 1 deletion)
@@ -8,4 +8,6 @@ repositories/
 .venv/
 .vscode/
 .test-venv/
-PRIVATE_*
+.temp/
+PRIVATE_*
+private/*
Dockerfile: 14 changes (7 additions, 7 deletions)
@@ -2,17 +2,16 @@
 ### Approximately 5 ~ 10 minutes to build

 # Select the required CUDA version.
-ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM nvidia/cuda:${CUDA_IMAGE}
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
 ENV PYTHON_VERSION="3.11.4"
 ENV PYTHON_VERSION_SHORT="3.11"
 ENV HOST 0.0.0.0
 ENV PORT=8000

 # Copy the necessary files.
-COPY requirements.txt /app/requirements.txt
-COPY pyproject.toml /app/pyproject.toml
 COPY llama_api /app/llama_api
+COPY pyproject.toml /app/pyproject.toml
+COPY requirements.txt /app/requirements.txt
 COPY main.py /app/main.py
 COPY model_definitions.py /app/model_definitions.py

 # Install the necessary applications, and then install Python.
 # Then, install the necessary Python packages(Dependencies).
@@ -41,7 +40,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 && apt-get clean \
 && rm -rf /tmp/* \
 && cd /app \
-&& python3 -m llama_api.server.app_settings --force-cuda --install-pkgs
+&& python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda
+# Need to skip compiling, because GPU access to the host is not supported when building the image.

 # Set the working directory and start the server.
 WORKDIR /app
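For reference, a minimal build-and-run sketch for this Dockerfile (the image tag llama-api:local and the published port are illustrative, not part of this PR): docker build -t llama-api:local . followed by docker run --gpus all -p 8000:8000 llama-api:local.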
build_shared_lib.py: 32 changes (32 additions, 0 deletions)
@@ -0,0 +1,32 @@
# flake8: noqa

from argparse import ArgumentParser
from llama_api.utils.llama_cpp import (
    build_shared_lib,
    CPU_ARGS,  # Only use CPU
    METAL_ARGS,  # Only use Metal (MacOS)
    CUBLAS_ARGS,  # Only use CUBLAS (Nvidia)
)
from os import environ

ARGS = {
    "CPU": CPU_ARGS,
    "METAL": METAL_ARGS,
    "CUBLAS": CUBLAS_ARGS,
    "CUDA": CUBLAS_ARGS,
}

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "-b",
        "--build_type",
        type=lambda s: str(s).upper(),
        default="CPU",
        choices=["CPU", "METAL", "CUBLAS", "CUDA"],
        help="Build type",
    )

    environ["FORCE_CMAKE"] = "1"
    environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
    build_shared_lib()
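A hedged usage sketch for the script above, assuming it is run from the repository root so that the llama_api package is importable: python3 build_shared_lib.py --build_type cuda. Because the argparse type callback upper-cases the value before the choices check, cuda, CUDA, and cublas all resolve to the same cuBLAS build flags.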
docker-compose.persistent.yml: 48 changes (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
version: '3.8'

volumes:
  llama-api-models:

services:
  llama-api:
    image: cosogi/llama-api:230816
    entrypoint: ["python3", "-m", "main", "--port", "8000"]
    environment:
      - FORCE_CUDA=1
      - LLAMA_API_MAX_WORKERS=1
      - LLAMA_API_API_KEY=
    volumes:
      - llama-api-models:/app/models
      - ./model_definitions.py:/app/model_definitions.py
      - ./main.py:/app/main.py
    ports:
      - 8000:8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]


# services:
#   llama-api:
#     build:
#       context: .
#       dockerfile: Dockerfile
#     entrypoint: ["python3", "-m", "main", "--port", "8000"]
#     environment:
#       - LLAMA_API_MAX_WORKERS=1
#       - LLAMA_API_API_KEY=
#     volumes:
#       - llama-api-models:/app/models
#       - ./model_definitions.py:/app/model_definitions.py
#       - ./main.py:/app/main.py
#     ports:
#       - 8000:8000
#     deploy:
#       resources:
#         reservations:
#           devices:
#             - driver: nvidia
#               capabilities: [gpu]
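A usage sketch for this new compose file, assuming Docker Compose v2: docker compose -f docker-compose.persistent.yml up -d. Unlike the ./models bind mount in docker-compose.yml, the named llama-api-models volume keeps downloaded model weights across container re-creations.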
docker-compose.yml: 48 changes (25 additions, 23 deletions)
@@ -2,10 +2,12 @@ version: '3'

 services:
   llama-api:
-    image: cosogi/llama-api:230730
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
-      - MAX_WORKERS=1
+      - FORCE_CUDA=1
+      - LLAMA_API_MAX_WORKERS=1
+      - LLAMA_API_API_KEY=
     volumes:
       - ./models:/app/models
       - ./llama_api:/app/llama_api
@@ -23,24 +25,24 @@ services:
               capabilities: [gpu]

 # services:
-# llama-api:
-# build:
-# context: .
-# dockerfile: Dockerfile
-# entrypoint: ["python3", "-m", "main", "--port", "8000"]
-# environment:
-# - MAX_WORKERS=1
-# volumes:
-# - ./models:/app/models
-# - ./llama_api:/app/llama_api
-# - ./model_definitions.py:/app/model_definitions.py
-# - ./main.py:/app/main.py
-# - ./requirements.txt:/app/requirements.txt
-# ports:
-# - 8000:8000
-# deploy:
-# resources:
-# reservations:
-# devices:
-# - driver: nvidia
-# capabilities: [gpu]
+#   llama-api:
+#     build:
+#       context: .
+#       dockerfile: Dockerfile
+#     entrypoint: ["python3", "-m", "main", "--port", "8000"]
+#     environment:
+#       - MAX_WORKERS=1
+#     volumes:
+#       - ./models:/app/models
+#       - ./llama_api:/app/llama_api
+#       - ./model_definitions.py:/app/model_definitions.py
+#       - ./main.py:/app/main.py
+#       - ./requirements.txt:/app/requirements.txt
+#     ports:
+#       - 8000:8000
+#     deploy:
+#       resources:
+#         reservations:
+#           devices:
+#             - driver: nvidia
+#               capabilities: [gpu]
llama_api/logits/base.py: 19 changes (19 additions, 0 deletions)
@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    import torch as pytorch


class BaseLogitProcessor(ABC):
    @abstractmethod
    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        """Process logits with PyTorch tensors."""

    @abstractmethod
    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        """Process logits with Python lists."""
llama_api/logits/bias.py: 86 changes (86 additions, 0 deletions)
@@ -0,0 +1,86 @@
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    List,
    Optional,
)

from ..utils.logger import ApiLogger
from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch

logger = ApiLogger(__name__)

try:
    import tiktoken

    openai_decoder = tiktoken.get_encoding("cl100k_base").decode
except Exception as e:
    logger.warning(
        "Could not load tiktoken, which is required for OpenAI GPT models. "
        f"Please `pip install tiktoken` to use the OpenAI encoder: {e}"
    )
    openai_decoder: Optional[Callable[[List[int]], str]] = None


class LogitBiasProcessor(BaseLogitProcessor):
    """Create a logit bias processor to bias the logit scores."""

    def __init__(
        self,
        logit_bias: Dict[str, float],
        encoder: Callable[[str], List[int]],
        is_openai: bool = False,
    ):
        """Create a logit bias processor to bias the logit scores."""
        global openai_decoder

        biases = {}  # type: Dict[int, float]
        for id_or_token, bias in logit_bias.items():
            is_digit = id_or_token.isdigit()

            if is_digit and is_openai and openai_decoder is not None:
                # If we have an OpenAI id, we need to convert it to a token
                # and then encode the token to get the ids
                for id in encoder(openai_decoder([int(id_or_token)])):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias
            elif is_digit:
                # If we have a digit, we can just use it directly
                biases[int(id_or_token)] = bias
            else:
                # Otherwise, we need to encode the token and use the ids
                for id in encoder(id_or_token):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias

        self._biases = biases
        self._bias_tensor = None

    def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor":
        if self._bias_tensor is None:
            import torch

            self._bias_tensor = torch.zeros(
                scores.shape[-1], dtype=scores.dtype, device=scores.device
            )
            for id, bias in self._biases.items():
                self._bias_tensor[id] = bias

        return self._bias_tensor

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        return scores + self._get_bias_tensor(scores)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        for id, bias in self._biases.items():
            scores[id] += bias
        return scores
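A quick sketch of the biasing behavior with a toy two-token encoder; the real server would pass a tokenizer's encode function, and every name below is illustrative:

from llama_api.logits.bias import LogitBiasProcessor

toy_vocab = {"hello": 0, "world": 1}

processor = LogitBiasProcessor(
    logit_bias={"hello": 5.0, "1": -2.5},  # one token string, one raw token id
    encoder=lambda text: [toy_vocab[token] for token in text.split()],
)
print(processor.without_torch(input_ids=[], scores=[0.0, 0.0]))
# [5.0, -2.5]: "hello" biases id 0 by +5.0; the digit key "1" biases id 1 by -2.5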
llama_api/logits/muse.py: 78 changes (78 additions, 0 deletions)
@@ -0,0 +1,78 @@
# flake8: noqa
from typing import TYPE_CHECKING, List, Tuple

from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch


class MuseLogitProcessor(BaseLogitProcessor):
    """Performs dampening of the k highest probability elements.

    Args:
        top_k (`int`):
            The number of highest probability vocabulary tokens to keep for top-k-filtering.
        damp (`float`, *optional*, defaults to 0.9):
            How much less likely should the top_k most likely tokens be made. If set to 0, they become impossible.
    """

    def __init__(
        self,
        top_k: int = 3,
        damp: float = 0.9,
        damp_initial: float = 1.0,
        damp_ramp_tokens: int = 32,
        min_tokens_to_keep: int = 1,
    ):
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError(
                "`top_k` has to be a strictly positive integer, "
                f"but is {top_k}"
            )

        self.top_k = max(top_k, min_tokens_to_keep)
        self.damp = damp
        self.damp_initial = damp_initial
        self.damp_ramp_tokens = damp_ramp_tokens
        self.token_num = 0

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        import torch

        top_k_safety = min(self.top_k, scores.size(-1))  # Safety check
        linear_damp = self.linear_damp
        topk_values, topk_indices = torch.topk(
            scores, top_k_safety, dim=-1
        )  # Specify the dimension
        self.token_num += 1
        return scores.scatter_(-1, topk_indices, topk_values * linear_damp)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        top_k_safety = min(self.top_k, len(scores))  # Safety check
        linear_damp = self.linear_damp
        topk_values_indices = sorted(
            range(len(scores)), key=lambda x: scores[x], reverse=True
        )[:top_k_safety]
        self.token_num += 1
        return [
            score * linear_damp if idx in topk_values_indices else score
            for idx, score in enumerate(scores)
        ]

    @property
    def linear_damp(self) -> float:
        ratio = (
            1.0
            if self.damp_ramp_tokens == 0
            else min(self.token_num / self.damp_ramp_tokens, 1.0)
        )
        return (
            self.damp_initial + ratio * (self.damp - self.damp_initial)
            if ratio < 1.0
            else self.damp
        )
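A worked sketch of the ramp: linear_damp moves linearly from damp_initial toward damp over the first damp_ramp_tokens processed tokens, so halfway through the ramp it sits at the midpoint:

from llama_api.logits.muse import MuseLogitProcessor

proc = MuseLogitProcessor(top_k=3, damp=0.9, damp_initial=1.0, damp_ramp_tokens=32)
for _ in range(16):  # process 16 of the 32 ramp tokens
    proc.without_torch([], [0.3, 0.2, 0.1, 0.0])
print(proc.linear_damp)  # 1.0 + 0.5 * (0.9 - 1.0) = 0.95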