Dev update (23.8.17.) #4

Merged: 15 commits, Aug 17, 2023
.gitignore: 4 changes (3 additions, 1 deletion)
@@ -8,4 +8,6 @@ repositories/
 .venv/
 .vscode/
 .test-venv/
-PRIVATE_*
+.temp/
+PRIVATE_*
+private/*
Dockerfile: 14 changes (7 additions, 7 deletions)
@@ -2,17 +2,16 @@
 ### Approximately 5 ~ 10 minutes to build

 # Select the required CUDA version.
-ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM nvidia/cuda:${CUDA_IMAGE}
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
 ENV PYTHON_VERSION="3.11.4"
 ENV PYTHON_VERSION_SHORT="3.11"
 ENV HOST 0.0.0.0
 ENV PORT=8000

 # Copy the necessary files.
-COPY requirements.txt /app/requirements.txt
-COPY pyproject.toml /app/pyproject.toml
 COPY llama_api /app/llama_api
+COPY pyproject.toml /app/pyproject.toml
+COPY requirements.txt /app/requirements.txt
 COPY main.py /app/main.py
 COPY model_definitions.py /app/model_definitions.py

 # Install the necessary applications, and then install Python.
 # Then, install the necessary Python packages(Dependencies).
@@ -41,7 +40,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 && apt-get clean \
 && rm -rf /tmp/* \
 && cd /app \
-&& python3 -m llama_api.server.app_settings --force-cuda --install-pkgs
+&& python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda
+# Need to skip compiling, because GPU access to the host is not supported when building the image.

 # Set the working directory and start the server.
 WORKDIR /app
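For reference, a minimal build-and-run sketch for this Dockerfile (the image tag llama-api:local and the published port are illustrative, not part of this PR): docker build -t llama-api:local . followed by docker run --gpus all -p 8000:8000 llama-api:local.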
build_shared_lib.py: 32 changes (32 additions, 0 deletions)
@@ -0,0 +1,32 @@
# flake8: noqa

from argparse import ArgumentParser
from llama_api.utils.llama_cpp import (
    build_shared_lib,
    CPU_ARGS,  # Only use CPU
    METAL_ARGS,  # Only use Metal (MacOS)
    CUBLAS_ARGS,  # Only use CUBLAS (Nvidia)
)
from os import environ

ARGS = {
    "CPU": CPU_ARGS,
    "METAL": METAL_ARGS,
    "CUBLAS": CUBLAS_ARGS,
    "CUDA": CUBLAS_ARGS,
}

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "-b",
        "--build_type",
        type=lambda s: str(s).upper(),
        default="CPU",
        choices=["CPU", "METAL", "CUBLAS", "CUDA"],
        help="Build type",
    )

    environ["FORCE_CMAKE"] = "1"
    environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
    build_shared_lib()
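A hedged usage sketch for the script above, assuming it is run from the repository root so that the llama_api package is importable: python3 build_shared_lib.py --build_type cuda. Because the argparse type callback upper-cases the value before the choices check, cuda, CUDA, and cublas all resolve to the same cuBLAS build flags.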
docker-compose.persistent.yml: 48 changes (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
version: '3.8'

volumes:
  llama-api-models:

services:
  llama-api:
    image: cosogi/llama-api:230816
    entrypoint: ["python3", "-m", "main", "--port", "8000"]
    environment:
      - FORCE_CUDA=1
      - LLAMA_API_MAX_WORKERS=1
      - LLAMA_API_API_KEY=
    volumes:
      - llama-api-models:/app/models
      - ./model_definitions.py:/app/model_definitions.py
      - ./main.py:/app/main.py
    ports:
      - 8000:8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]


# services:
#   llama-api:
#     build:
#       context: .
#       dockerfile: Dockerfile
#     entrypoint: ["python3", "-m", "main", "--port", "8000"]
#     environment:
#       - LLAMA_API_MAX_WORKERS=1
#       - LLAMA_API_API_KEY=
#     volumes:
#       - llama-api-models:/app/models
#       - ./model_definitions.py:/app/model_definitions.py
#       - ./main.py:/app/main.py
#     ports:
#       - 8000:8000
#     deploy:
#       resources:
#         reservations:
#           devices:
#             - driver: nvidia
#               capabilities: [gpu]
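A usage sketch for this new compose file, assuming Docker Compose v2: docker compose -f docker-compose.persistent.yml up -d. Unlike the ./models bind mount in docker-compose.yml, the named llama-api-models volume keeps downloaded model weights across container re-creations.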
docker-compose.yml: 48 changes (25 additions, 23 deletions)
@@ -2,10 +2,12 @@ version: '3'

 services:
   llama-api:
-    image: cosogi/llama-api:230730
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
-      - MAX_WORKERS=1
+      - FORCE_CUDA=1
+      - LLAMA_API_MAX_WORKERS=1
+      - LLAMA_API_API_KEY=
     volumes:
       - ./models:/app/models
       - ./llama_api:/app/llama_api
@@ -23,24 +25,24 @@ services:
               capabilities: [gpu]

 # services:
-# llama-api:
-# build:
-# context: .
-# dockerfile: Dockerfile
-# entrypoint: ["python3", "-m", "main", "--port", "8000"]
-# environment:
-# - MAX_WORKERS=1
-# volumes:
-# - ./models:/app/models
-# - ./llama_api:/app/llama_api
-# - ./model_definitions.py:/app/model_definitions.py
-# - ./main.py:/app/main.py
-# - ./requirements.txt:/app/requirements.txt
-# ports:
-# - 8000:8000
-# deploy:
-# resources:
-# reservations:
-# devices:
-# - driver: nvidia
-# capabilities: [gpu]
+#   llama-api:
+#     build:
+#       context: .
+#       dockerfile: Dockerfile
+#     entrypoint: ["python3", "-m", "main", "--port", "8000"]
+#     environment:
+#       - MAX_WORKERS=1
+#     volumes:
+#       - ./models:/app/models
+#       - ./llama_api:/app/llama_api
+#       - ./model_definitions.py:/app/model_definitions.py
+#       - ./main.py:/app/main.py
+#       - ./requirements.txt:/app/requirements.txt
+#     ports:
+#       - 8000:8000
+#     deploy:
+#       resources:
+#         reservations:
+#           devices:
+#             - driver: nvidia
+#               capabilities: [gpu]
llama_api/logits/base.py: 19 changes (19 additions, 0 deletions)
@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    import torch as pytorch


class BaseLogitProcessor(ABC):
    @abstractmethod
    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        """Process logits with PyTorch tensors."""

    @abstractmethod
    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        """Process logits with Python lists."""
llama_api/logits/bias.py: 86 changes (86 additions, 0 deletions)
@@ -0,0 +1,86 @@
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    List,
    Optional,
)

from ..utils.logger import ApiLogger
from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch

logger = ApiLogger(__name__)

try:
    import tiktoken

    openai_decoder = tiktoken.get_encoding("cl100k_base").decode
except Exception as e:
    logger.warning(
        "Could not load tiktoken, which is required for OpenAI GPT models. "
        f"Please `pip install tiktoken` to use the OpenAI encoder: {e}"
    )
    openai_decoder: Optional[Callable[[List[int]], str]] = None


class LogitBiasProcessor(BaseLogitProcessor):
    """Create a logit bias processor to bias the logit scores."""

    def __init__(
        self,
        logit_bias: Dict[str, float],
        encoder: Callable[[str], List[int]],
        is_openai: bool = False,
    ):
        """Create a logit bias processor to bias the logit scores."""
        global openai_decoder

        biases = {}  # type: Dict[int, float]
        for id_or_token, bias in logit_bias.items():
            is_digit = id_or_token.isdigit()

            if is_digit and is_openai and openai_decoder is not None:
                # If we have an OpenAI id, we need to convert it to a token
                # and then encode the token to get the ids
                for id in encoder(openai_decoder([int(id_or_token)])):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias
            elif is_digit:
                # If we have a digit, we can just use it directly
                biases[int(id_or_token)] = bias
            else:
                # Otherwise, we need to encode the token and use the ids
                for id in encoder(id_or_token):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias

        self._biases = biases
        self._bias_tensor = None

    def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor":
        if self._bias_tensor is None:
            import torch

            self._bias_tensor = torch.zeros(
                scores.shape[-1], dtype=scores.dtype, device=scores.device
            )
            for id, bias in self._biases.items():
                self._bias_tensor[id] = bias

        return self._bias_tensor

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        return scores + self._get_bias_tensor(scores)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        for id, bias in self._biases.items():
            scores[id] += bias
        return scores
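A quick sketch of the biasing behavior with a toy two-token encoder; the real server would pass a tokenizer's encode function, and every name below is illustrative:

from llama_api.logits.bias import LogitBiasProcessor

toy_vocab = {"hello": 0, "world": 1}

processor = LogitBiasProcessor(
    logit_bias={"hello": 5.0, "1": -2.5},  # one token string, one raw token id
    encoder=lambda text: [toy_vocab[token] for token in text.split()],
)
print(processor.without_torch(input_ids=[], scores=[0.0, 0.0]))
# [5.0, -2.5]: "hello" biases id 0 by +5.0; the digit key "1" biases id 1 by -2.5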
llama_api/logits/muse.py: 78 changes (78 additions, 0 deletions)
@@ -0,0 +1,78 @@
# flake8: noqa
from typing import TYPE_CHECKING, List, Tuple

from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch


class MuseLogitProcessor(BaseLogitProcessor):
    """Performs dampening of the k highest probability elements.

    Args:
        top_k (`int`):
            The number of highest probability vocabulary tokens to keep for top-k-filtering.
        damp (`float`, *optional*, defaults to 0.9):
            How much less likely should the top_k most likely tokens be made. If set to 0, they become impossible.
    """

    def __init__(
        self,
        top_k: int = 3,
        damp: float = 0.9,
        damp_initial: float = 1.0,
        damp_ramp_tokens: int = 32,
        min_tokens_to_keep: int = 1,
    ):
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError(
                "`top_k` has to be a strictly positive integer, "
                f"but is {top_k}"
            )

        self.top_k = max(top_k, min_tokens_to_keep)
        self.damp = damp
        self.damp_initial = damp_initial
        self.damp_ramp_tokens = damp_ramp_tokens
        self.token_num = 0

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        import torch

        top_k_safety = min(self.top_k, scores.size(-1))  # Safety check
        linear_damp = self.linear_damp
        topk_values, topk_indices = torch.topk(
            scores, top_k_safety, dim=-1
        )  # Specify the dimension
        self.token_num += 1
        return scores.scatter_(-1, topk_indices, topk_values * linear_damp)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        top_k_safety = min(self.top_k, len(scores))  # Safety check
        linear_damp = self.linear_damp
        topk_values_indices = sorted(
            range(len(scores)), key=lambda x: scores[x], reverse=True
        )[:top_k_safety]
        self.token_num += 1
        return [
            score * linear_damp if idx in topk_values_indices else score
            for idx, score in enumerate(scores)
        ]

    @property
    def linear_damp(self) -> float:
        ratio = (
            1.0
            if self.damp_ramp_tokens == 0
            else min(self.token_num / self.damp_ramp_tokens, 1.0)
        )
        return (
            self.damp_initial + ratio * (self.damp - self.damp_initial)
            if ratio < 1.0
            else self.damp
        )
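A worked sketch of the ramp: linear_damp moves linearly from damp_initial toward damp over the first damp_ramp_tokens processed tokens, so halfway through the ramp it sits at the midpoint:

from llama_api.logits.muse import MuseLogitProcessor

proc = MuseLogitProcessor(top_k=3, damp=0.9, damp_initial=1.0, damp_ramp_tokens=32)
for _ in range(16):  # process 16 of the 32 ramp tokens
    proc.without_torch([], [0.3, 0.2, 0.1, 0.0])
print(proc.linear_damp)  # 1.0 + 0.5 * (0.9 - 1.0) = 0.95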