From 668faeb733dd0fae5093a5a5f7ba16ef6eba87e5 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 01:07:22 +0900 Subject: [PATCH 01/15] Added logit processors --- llama_api/logits/base.py | 19 ++ llama_api/logits/bias.py | 56 ++++ llama_api/logits/muse.py | 78 +++++ llama_api/mixins/logits.py | 34 ++ llama_api/mixins/prompt_utils.py | 21 +- llama_api/modules/base.py | 25 +- llama_api/modules/exllama.py | 550 +++++++++++++++++++++---------- llama_api/modules/llama_cpp.py | 103 ++---- llama_api/schemas/api.py | 34 +- llama_api/schemas/models.py | 8 + llama_api/server/pools/llama.py | 18 +- llama_api/utils/errors.py | 4 +- llama_api/utils/process_pool.py | 11 + llama_api/utils/system.py | 2 +- 14 files changed, 669 insertions(+), 294 deletions(-) create mode 100644 llama_api/logits/base.py create mode 100644 llama_api/logits/bias.py create mode 100644 llama_api/logits/muse.py create mode 100644 llama_api/mixins/logits.py diff --git a/llama_api/logits/base.py b/llama_api/logits/base.py new file mode 100644 index 0000000..f7449a7 --- /dev/null +++ b/llama_api/logits/base.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + import torch as pytorch + + +class BaseLogitProcessor(ABC): + @abstractmethod + def with_torch( + self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor" + ) -> "pytorch.Tensor": + """Process logits with PyTorch tensors.""" + + @abstractmethod + def without_torch( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + """Process logits with Python lists.""" diff --git a/llama_api/logits/bias.py b/llama_api/logits/bias.py new file mode 100644 index 0000000..ebdae0f --- /dev/null +++ b/llama_api/logits/bias.py @@ -0,0 +1,56 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, Literal, Optional + +from .base import BaseLogitProcessor + +if TYPE_CHECKING: + import torch as pytorch + + +class LogitBiasProcessor(BaseLogitProcessor): + """Create a logit bias processor to bias the logit scores.""" + + def __init__( + self, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], + encoder: Callable[[str], List[int]], + ): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias = {} # type: Dict[int, float] + if logit_bias_type == "input_ids": + for input_id_string, score in logit_bias.items(): + to_bias[int(input_id_string)] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + for input_id in encoder(token): + to_bias[input_id] = score + + self._to_bias = to_bias + self._bias_tensor = None + + def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": + if self._bias_tensor is None: + import torch + + self._bias_tensor = torch.zeros( + scores.shape[-1], dtype=scores.dtype, device=scores.device + ) + for idx, value in self._to_bias.items(): + self._bias_tensor[idx] = value + + return self._bias_tensor + + def with_torch( + self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor" + ) -> "pytorch.Tensor": + return scores + self._get_bias_tensor(scores) + + def without_torch( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + for id, biased_score in self._to_bias.items(): + scores[id] += biased_score + return scores diff --git a/llama_api/logits/muse.py b/llama_api/logits/muse.py new file mode 100644 index 0000000..8c0414c --- /dev/null +++ b/llama_api/logits/muse.py @@ -0,0 +1,78 @@ +# flake8: noqa +from typing import TYPE_CHECKING, List, Tuple + +from 
.base import BaseLogitProcessor
+
+if TYPE_CHECKING:
+    import torch as pytorch
+
+
+class MuseLogitProcessor(BaseLogitProcessor):
+    """Performs dampening of the k highest probability elements.
+
+    Args:
+        top_k (`int`):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering.
+        damp (`float`, *optional*, defaults to 0.9):
+            How much less likely the top_k most likely tokens should be made. If set to 0, they become impossible.
+    """
+
+    def __init__(
+        self,
+        top_k: int = 3,
+        damp: float = 0.9,
+        damp_initial: float = 1.0,
+        damp_ramp_tokens: int = 32,
+        min_tokens_to_keep: int = 1,
+    ):
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError(
+                "`top_k` has to be a strictly positive integer, "
+                f"but is {top_k}"
+            )
+
+        self.top_k = max(top_k, min_tokens_to_keep)
+        self.damp = damp
+        self.damp_initial = damp_initial
+        self.damp_ramp_tokens = damp_ramp_tokens
+        self.token_num = 0
+
+    def with_torch(
+        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
+    ) -> "pytorch.Tensor":
+        import torch
+
+        top_k_safety = min(self.top_k, scores.size(-1))  # Safety check
+        linear_damp = self.linear_damp
+        topk_values, topk_indices = torch.topk(
+            scores, top_k_safety, dim=-1
+        )  # Specify the dimension
+        self.token_num += 1
+        return scores.scatter_(-1, topk_indices, topk_values * linear_damp)
+
+    def without_torch(
+        self, input_ids: List[int], scores: List[float]
+    ) -> List[float]:
+        top_k_safety = min(self.top_k, len(scores))  # Safety check
+        linear_damp = self.linear_damp
+        topk_values_indices = sorted(
+            range(len(scores)), key=lambda x: scores[x], reverse=True
+        )[:top_k_safety]
+        self.token_num += 1
+        return [
+            score * linear_damp if idx in topk_values_indices else score
+            for idx, score in enumerate(scores)
+        ]
+
+    @property
+    def linear_damp(self) -> float:
+        ratio = (
+            1.0
+            if self.damp_ramp_tokens == 0
+            else min(self.token_num / self.damp_ramp_tokens, 1.0)
+        )
+        return (
+            self.damp_initial + ratio * (self.damp - self.damp_initial)
+            if ratio < 1.0
+            else self.damp
+        )
diff --git a/llama_api/mixins/logits.py b/llama_api/mixins/logits.py
new file mode 100644
index 0000000..75867a1
--- /dev/null
+++ b/llama_api/mixins/logits.py
@@ -0,0 +1,34 @@
+from typing import Callable, List
+
+from ..logits.base import BaseLogitProcessor
+from ..logits.bias import LogitBiasProcessor
+from ..logits.muse import MuseLogitProcessor
+from ..schemas.api import TextGenerationSettings
+
+
+class LogitsMixin:
+    @staticmethod
+    def get_logit_processors(
+        settings: TextGenerationSettings, encoder: Callable[[str], List[int]]
+    ) -> List[BaseLogitProcessor]:
+        logit_processors: List[BaseLogitProcessor] = []
+        if settings.muse:
+            logit_processors.append(
+                MuseLogitProcessor(
+                    top_k=3,
+                    damp=0.9,
+                    damp_initial=1.0,
+                    damp_ramp_tokens=32,
+                    min_tokens_to_keep=1,
+                )
+            )
+        if settings.logit_bias is not None:
+            logit_processors.insert(
+                0,
+                LogitBiasProcessor(
+                    logit_bias=settings.logit_bias,
+                    logit_bias_type=settings.logit_bias_type,
+                    encoder=encoder,
+                ),
+            )
+        return logit_processors
diff --git a/llama_api/mixins/prompt_utils.py b/llama_api/mixins/prompt_utils.py
index adc6194..0b19dec 100644
--- a/llama_api/mixins/prompt_utils.py
+++ b/llama_api/mixins/prompt_utils.py
@@ -61,18 +61,23 @@ def convert_messages_into_prompt(
         return chat_history + f"### {ai_input_role}:"
 
     @staticmethod
-    def is_possible_to_generate_stops(
-        decoded_text: str, stops: List[str]
-    ) -> bool:
+    def is_possible_to_generate_stops(text: str, stops: List[str]) -> bool:
         """A helper method to
check if the decoded text contains any of the stop tokens.""" for stop in stops: - if stop in decoded_text or any( - [ - decoded_text.endswith(stop[: i + 1]) - for i in range(len(stop)) - ] + if stop in text or any( + [text.endswith(stop[: i + 1]) for i in range(len(stop))] ): return True return False + + @staticmethod + def raise_for_token_limit(prompt_tokens: int, context_window: int) -> None: + """A helper method to raise an error if the number of tokens + requested for completion exceeds the context window.""" + if prompt_tokens >= context_window: + raise ValueError( + f"Requested tokens ({prompt_tokens}) exceed " + f"context window of {context_window}" + ) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index fffc588..6bd6286 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -2,8 +2,10 @@ from dataclasses import dataclass from typing import Any, Iterator, List, TypeVar -from ..mixins.prompt_utils import PromptUtilsMixin +from llama_api.mixins.logits import LogitsMixin + from ..mixins.interrupt import InterruptMixin +from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( APIChatMessage, ChatCompletion, @@ -24,7 +26,9 @@ class BaseLLMModel: max_total_tokens: int = 2048 -class BaseCompletionGenerator(ABC, PromptUtilsMixin, InterruptMixin): +class BaseCompletionGenerator( + ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin +): """Base class for all completion generators.""" @abstractmethod @@ -38,14 +42,12 @@ def from_pretrained( cls, llm_model: "BaseLLMModel" ) -> "BaseCompletionGenerator": """Load a pretrained model into RAM.""" - ... @abstractmethod def generate_completion( self, prompt: str, settings: TextGenerationSettings ) -> Completion: """Generate a completion for a given prompt.""" - ... @abstractmethod def generate_completion_with_streaming( @@ -53,14 +55,12 @@ def generate_completion_with_streaming( ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - ... @abstractmethod def generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" - ... @abstractmethod def generate_chat_completion_with_streaming( @@ -68,20 +68,25 @@ def generate_chat_completion_with_streaming( ) -> Iterator[ChatCompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - ... + + @abstractmethod + def encode(self, text: str, **kwargs: Any) -> List[int]: + """Encode a text string into a list of token IDs.""" + + @abstractmethod + def decode(self, ids: List[int], **kwargs: Any) -> str: + """Decode a list of token IDs into a text string.""" @property @abstractmethod def llm_model(self) -> "BaseLLMModel": """The LLM model used by this generator.""" - ... class BaseEmbeddingGenerator(ABC): @abstractmethod def __del__(self): """Clean up resources.""" - ... @classmethod @abstractmethod @@ -96,10 +101,8 @@ def generate_embeddings( **kwargs: Any, ) -> List[List[float]]: """Generate embeddings for a list of texts.""" - ... @property @abstractmethod def model_name(self) -> str: """Identifier for the model used by this generator.""" - ... 
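Note on the logit-processor plumbing introduced in this patch: `LogitsMixin.get_logit_processors` builds the processor list from `TextGenerationSettings` (the `muse` flag and the `logit_bias` mapping), and generators apply each processor once per sampling step through `with_torch` or `without_torch`. A minimal sketch of the non-torch path follows; the `encode` stub, the example bias table, and the vocabulary size are illustrative assumptions, not part of the patch.

# Hypothetical usage sketch of the processors added by this patch.
from typing import List

from llama_api.logits.bias import LogitBiasProcessor
from llama_api.logits.muse import MuseLogitProcessor


def encode(text: str) -> List[int]:
    """Stand-in for a real tokenizer's encode callable (assumption)."""
    return [ord(char) for char in text]


processors = [
    LogitBiasProcessor(
        logit_bias={"123": 5.0},  # push token id 123 up by +5
        logit_bias_type="input_ids",
        encoder=encode,
    ),
    MuseLogitProcessor(top_k=3, damp=0.9, damp_ramp_tokens=32),
]

scores = [0.0] * 32000  # one logit per vocabulary entry (size is illustrative)
input_ids = [1, 2, 3]  # token ids generated so far
for processor in processors:
    scores = processor.without_torch(input_ids, scores)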
diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 2125bf8..7fb0a0f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,10 +1,21 @@ """Wrapper for exllama to generate text completions.""" -from contextlib import contextmanager from pathlib import Path -from typing import TYPE_CHECKING, Dict, Iterator, List, Optional +from typing import ( + TYPE_CHECKING, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, + overload, +) -from torch import IntTensor, Tensor, cuda +from torch import IntTensor, Tensor, cuda, version +from torch.nn.functional import log_softmax +from ..logits.base import BaseLogitProcessor from ..schemas.models import ExllamaModel from ..utils.completions import ( make_chat_completion, @@ -15,6 +26,7 @@ from ..utils.dependency import import_repository from ..utils.logger import ApiLogger from ..utils.path import resolve_model_path_to_posix +from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator with import_repository( @@ -35,28 +47,32 @@ TextGenerationSettings, ) -logger = ApiLogger("||🦙 exllama.generator||") assert cuda.is_available(), "CUDA must be available to use ExLlama." +logger = ApiLogger(__name__) +_stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops -def _encode(tokenizer: ExLlamaTokenizer, text: str) -> Tensor: - """Encode a text string into a tensor.""" - result = tokenizer.encode(text) - if isinstance(result, tuple): - return result[0] - else: - return result +def _make_config( + model_folder_path: Path, llm_model: "ExllamaModel" +) -> ExLlamaConfig: + """Create a config object for the ExLlama model.""" + # Find the model checkpoint + model_file_found: List[Path] = [] + for ext in (".safetensors", ".pt", ".bin"): + model_file_found.extend(model_folder_path.glob(f"*{ext}")) + if model_file_found: + if len(model_file_found) > 1: + logger.warning( + f"More than one {ext} model has been found. " + "The last one will be selected. It could be wrong." + ) -def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: - """Create a config object for the ExLlama model.""" - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ), - ) - config = ExLlamaConfig((model_folder_path / "config.json").as_posix()) + break + if not model_file_found: + raise FileNotFoundError( + f"No model has been found in {model_folder_path}." + ) # Find the model checkpoint model_file_found: List[Path] = [] @@ -74,6 +90,8 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: raise FileNotFoundError( f"No model has been found in {model_folder_path}." 
) + + config = ExLlamaConfig((model_folder_path / "config.json").as_posix()) config.model_path = model_file_found[-1].as_posix() # type: ignore config.max_seq_len = llm_model.max_total_tokens config.max_input_len = llm_model.max_total_tokens @@ -91,184 +109,299 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: config.matmul_fused_remap = llm_model.matmul_fused_remap config.silu_no_half2 = llm_model.silu_no_half2 config.concurrent_streams = llm_model.concurrent_streams + if llm_model.alpha_value is not None: + config.alpha_value = llm_model.alpha_value + config.calculate_rotary_embedding_base() + if version.hip: + config.rmsnorm_no_half2 = True + config.rope_no_half2 = True + config.matmul_no_half2 = True + config.silu_no_half2 = True return config -def _make_tokenizer(llm_model: "ExllamaModel") -> ExLlamaTokenizer: - """Create a tokenizer object for the ExLlama model.""" - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ), +def _apply_settings_to_generator( + cg: "ExllamaCompletionGenerator", + settings: "TextGenerationSettings", +) -> ExLlamaGenerator: + """Apply the settings to the generator.""" + # Make sure that the batch size is correct + required_batch_size = 1 if settings.guidance_scale == 1 else 2 + cache_batch_size = cg.cache.batch_size # type: int + if cache_batch_size != required_batch_size: + cg._cache = None + deallocate_memory(cg._cache) + cg._cache = ExLlamaCache(cg._model, batch_size=required_batch_size) + cg._generator = ExLlamaGenerator( + model=cg._model, tokenizer=cg._tokenizer, cache=cg._cache + ) + # Temperature cannot be 0.0, so we use a very small value instead. + # 0.0 will cause a division by zero error. + generator = cg.generator + generator.settings.temperature = settings.temperature or 0.01 + generator.settings.top_p = settings.top_p + generator.settings.top_k = settings.top_k + generator.settings.typical = settings.typical_p + generator.settings.token_repetition_penalty_max = settings.repeat_penalty + generator.settings.token_repetition_penalty_sustain = ( + -1 + if settings.repetition_penalty_range <= 0 + else settings.repetition_penalty_range + ) + disallowed_tokens = ( + [generator.tokenizer.eos_token_id] if settings.ban_eos_token else None ) - return ExLlamaTokenizer( - (model_folder_path / "tokenizer.model").as_posix(), + generator.disallow_tokens(disallowed_tokens) + return generator + + +def _gen_single_token_with_cfg( + generator: ExLlamaGenerator, mask: Tensor, cfg_alpha: float +) -> int: + logits = generator.model.forward( + generator.sequence[:, -1:], cache=generator.cache, input_mask=mask + ) # type: Tensor # type: ignore + generator.apply_rep_penalty(logits) + probs = log_softmax(logits, dim=-1) + token, _ = generator.sample_current( + cfg_alpha * probs[0] + (1 - cfg_alpha) * probs[1] ) + generator.gen_accept_token(token.repeat(2, 1)) + return int(token.item()) + + +def _gen_single_token_without_cfg( + generator: ExLlamaGenerator, + initial_len: int, + constraints: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + logit_processors: Optional[Iterable[BaseLogitProcessor]] = None, +) -> int: + generator.end_beam_search() + + # Simple sampling case: + if generator.sequence is not None: + logits = generator.model.forward( + generator.sequence[:, -1:], + generator.cache, + lora=generator.lora, + input_mask=mask, + ) # type: Tensor # type: ignore + generator.apply_rep_penalty(logits) + logits[:, :, generator.tokenizer.bos_token_id] = -10000.0 + + if 
logit_processors is not None: + input_ids = generator.sequence[0][initial_len:] + for logit_processor in logit_processors: + logits = logit_processor.with_torch(input_ids, logits) + + if constraints is not None: + for constraint in constraints: + logits[:, :, constraint] += 10000.0 + logits[:, :, :] -= 10000.0 + + token, _ = generator.batched_sample( + logits, + generator.settings.temperature, + generator.settings.top_k, + generator.settings.top_p, + generator.settings.min_p + 0.01 + if constraints is not None + else 0.0, + generator.settings.typical, + ) + + else: + if constraints is not None: + token = constraints[0] + else: + token = Tensor([[generator.tokenizer.bos_token_id]]).long() + + generator.gen_accept_token(token) + return int(token.item()) + + +def _generator( + cg: "ExllamaCompletionGenerator", + settings: "TextGenerationSettings", + stops: List[str], + cfg_mask: Optional[Tensor] = None, +) -> Iterator[str]: + IdToPiece = cg.tokenizer.tokenizer.IdToPiece + decoder = cg.tokenizer.decode + generator = cg.generator + + cfg_alpha = settings.guidance_scale # type: float + initial_len = generator.sequence[0].shape[0] # type: int + eos_token_id = generator.tokenizer.eos_token_id # type: int + has_leading_space = False # type: bool + text_cursor = 0 # type: int + n_tokens = 0 # type: int + logit_processors = ( + [ + processor + for processor in BaseCompletionGenerator.get_logit_processors( + settings=settings, + encoder=cg.encode, + ) + ] + if cfg_mask is None + else None + ) # type: Optional[Iterable[BaseLogitProcessor]] + for n_tokens in range(1, settings.max_tokens + 1): + if cg.is_interrupted: + break # the generator was interrupted + + # Predict the next token id + if cfg_mask is not None: + token_id = _gen_single_token_with_cfg( + generator, mask=cfg_mask, cfg_alpha=cfg_alpha + ) + else: + token_id = _gen_single_token_without_cfg( + generator, + initial_len=initial_len, + logit_processors=logit_processors or None, + ) + if cg.is_interrupted or token_id == eos_token_id: + break + + # Yield the text piece + if n_tokens == 1: + has_leading_space = IdToPiece(token_id).startswith("▁") + decoded_text = ( + " " + str(decoder(generator.sequence[0][initial_len:])) + if has_leading_space + else str(decoder(generator.sequence[0][initial_len:])) + ) + text_piece = decoded_text[text_cursor:] + if "�" in text_piece: # Decode error when decoding multi-byte char + continue + if _stop_checker(text_piece, stops=stops): # Stop token found maybe + if any(stop in decoded_text for stop in stops): + break # Stop token found + continue + yield text_piece + text_cursor += len(text_piece) + # End of generation + cg._completion_status[settings.completion_id] = n_tokens + + +def _generate_text_with_streaming( + cg: "ExllamaCompletionGenerator", + prompt: str, + settings: "TextGenerationSettings", +) -> Iterator[str]: + try: + # Make sure that the stop token is a list + if isinstance(settings.stop, str): + stops = [settings.stop] # type: List[str] + elif isinstance(settings.stop, list): + stops = settings.stop + else: + stops = [] + + # Apply the settings to the generator + generator = _apply_settings_to_generator(cg, settings=settings) + + # Start the generator + if settings.guidance_scale == 1: + ids = _encode(cg.tokenizer, prompt) + mask = None # type: Optional[Tensor] + generator.end_beam_search() + generator.gen_begin_reuse(ids) + else: + ids, mask = _encode( + cg.tokenizer, + [prompt, settings.negative_prompt or ""], + return_mask=True, + ) + generator.gen_begin(ids, mask=mask) + 
cg.raise_for_token_limit( + prompt_tokens=ids.shape[-1], + context_window=cg.llm_model.max_total_tokens, + ) + yield from _generator( + cg, cfg_mask=mask, settings=settings, stops=stops + ) + except Exception as e: + logger.exception(e) + raise e class ExllamaCompletionGenerator(BaseCompletionGenerator): - config: Optional[ExLlamaConfig] = None - model: Optional[ExLlama] = None - cache: Optional[ExLlamaCache] = None - tokenizer: Optional[ExLlamaTokenizer] = None - generator: Optional[ExLlamaGenerator] = None + _config: Optional[ExLlamaConfig] = None + _model: Optional[ExLlama] = None + _cache: Optional[ExLlamaCache] = None + _tokenizer: Optional[ExLlamaTokenizer] = None + _generator: Optional[ExLlamaGenerator] = None _llm_model: Optional["ExllamaModel"] = None _completion_status: Dict[ str, int ] = {} # key: completion_id, value: number of completion tokens - def __del__(self) -> None: - if self.model is not None: - self.model.free_unmanaged() - del self.model - self.model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") - if self.tokenizer is not None: - getattr(self.tokenizer, "__del__", lambda: None)() - del self.tokenizer - self.tokenizer = None - logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") - if self.cache is not None: - getattr(self.cache, "__del__", lambda: None)() - del self.cache - self.cache = None - logger.info("🗑️ ExllamaCompletionGenerator cache deleted") - @property def llm_model(self) -> "ExllamaModel": assert self._llm_model is not None return self._llm_model + @property + def generator(self) -> ExLlamaGenerator: + assert self._generator is not None, "Generator is not initialized." + return self._generator + + @property + def tokenizer(self) -> ExLlamaTokenizer: + assert self._tokenizer is not None, "Tokenizer is not initialized." + return self._tokenizer + + @property + def cache(self) -> ExLlamaCache: + assert self._cache is not None, "Cache is not initialized." + return self._cache + + @property + def model(self) -> ExLlama: + assert self._model is not None, "Model is not initialized." + return self._model + + @property + def config(self) -> ExLlamaConfig: + assert self._config is not None, "Config is not initialized." + return self._config + @classmethod def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": result = cls() - result.config = _make_config(llm_model) - result.tokenizer = _make_tokenizer(llm_model) - result.model = ExLlama(result.config) - result.cache = ExLlamaCache(result.model) - result.generator = None - result._llm_model = llm_model - return result - - @contextmanager - def _generator_context_manager( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator[ExLlamaGenerator]: - """Make a generator object for the ExLlama model.""" - assert self.model is not None, "Model is not initialized." - assert self.tokenizer is not None, "Tokenizer is not initialized." - assert self.cache is not None, "Cache is not initialized." - - generator = ExLlamaGenerator( - model=self.model, - tokenizer=self.tokenizer, - cache=self.cache, + model_folder_path = Path( + resolve_model_path_to_posix( + llm_model.model_path, + default_relative_directory="models/gptq", + ) ) - # Temperature cannot be 0.0, so we use a very small value instead. - # 0.0 will cause a division by zero error. 
- generator.settings.temperature = settings.temperature or 0.01 - generator.settings.top_p = settings.top_p - generator.settings.top_k = settings.top_k - generator.settings.typical = settings.typical_p - generator.settings.token_repetition_penalty_max = ( - settings.repeat_penalty + result._config = _make_config(model_folder_path, llm_model) + result._tokenizer = ExLlamaTokenizer( + (model_folder_path / "tokenizer.model").as_posix() ) - if ( - settings.ban_eos_token - and generator.tokenizer.eos_token_id is not None - ): - generator.disallow_tokens([generator.tokenizer.eos_token_id]) - - generator.end_beam_search() - generator.gen_begin_reuse(generator.tokenizer.encode(prompt)) - yield generator - del generator - - def _generate_text( - self, prompt: str, settings: "TextGenerationSettings" - ) -> str: - return "".join( - self._generate_text_with_streaming(prompt, settings=settings) - ) - - def _generate_text_with_streaming( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator[str]: - assert ( - self.model is not None - and self.tokenizer is not None - and self.cache is not None + result._model = ExLlama(result._config) + result._cache = ExLlamaCache(result._model) + result._generator = ExLlamaGenerator( + result._model, result._tokenizer, result._cache ) - - # Make sure that the stop token is a list - if isinstance(settings.stop, str): - stops = [settings.stop] - elif isinstance(settings.stop, list): - stops = settings.stop - else: - stops = [] - - with self._generator_context_manager( - prompt, settings=settings - ) as generator: - # Start generation - initial_len = generator.sequence[0].shape[0] - has_leading_space: bool = False - text_cursor: int = 0 - n_completion_tokens: int = 0 - - for n_completion_tokens in range(1, settings.max_tokens + 1): - if self.is_interrupted: - return # the generator was interrupted - token = generator.gen_single_token() - if self.is_interrupted: - return # the generator was interrupted - if token.item() == generator.tokenizer.eos_token_id: - return - if ( - n_completion_tokens == 0 - and generator.tokenizer.tokenizer.IdToPiece( - int(token) - ).startswith("▁") - ): - has_leading_space = True - - decoded_text = str( - generator.tokenizer.decode( - generator.sequence[0][initial_len:] - ) - ) - if has_leading_space: - decoded_text = " " + decoded_text - if self.is_possible_to_generate_stops( - decoded_text, stops=stops - ): - for stop in stops: - if stop in decoded_text: - return - continue - text_piece = decoded_text[text_cursor:] - if "�" in text_piece: - continue - yield text_piece - text_cursor += len(text_piece) - self._completion_status[ - settings.completion_id - ] = n_completion_tokens + result._llm_model = llm_model + return result def generate_completion_with_streaming( self, prompt: str, settings: "TextGenerationSettings" ) -> Iterator["CompletionChunk"]: - assert self.config is not None and self.tokenizer is not None completion_id: str = settings.completion_id model_path: str = str(self.config.model_path) last_token: Optional[str] = None generated_text: str = "" - for token in self._generate_text_with_streaming( - prompt, settings=settings + for token in _generate_text_with_streaming( + self, prompt=prompt, settings=settings ): generated_text += token if last_token is not None: @@ -285,7 +418,8 @@ def generate_completion_with_streaming( text=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] + completion_id, + 
_encode(self.tokenizer, generated_text).shape[1], ) >= settings.max_tokens else "stop", @@ -294,9 +428,12 @@ def generate_completion_with_streaming( def generate_completion( self, prompt: str, settings: "TextGenerationSettings" ) -> "Completion": - assert self.tokenizer is not None and self.config is not None completion_id: str = settings.completion_id - generated_text: str = self._generate_text(prompt, settings=settings) + generated_text: str = "".join( + _generate_text_with_streaming( + self, prompt=prompt, settings=settings + ) + ) n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] n_completion_tokens: int = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] @@ -317,14 +454,13 @@ def generate_chat_completion_with_streaming( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> Iterator["ChatCompletionChunk"]: - assert self.config is not None and self.tokenizer is not None completion_id: str = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) model_path: str = str(self.config.model_path) last_token: Optional[str] = None generated_text: str = "" - for token in self._generate_text_with_streaming( - prompt, settings=settings + for token in _generate_text_with_streaming( + self, prompt=prompt, settings=settings ): generated_text += token if last_token is not None: @@ -341,7 +477,8 @@ def generate_chat_completion_with_streaming( content=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] + completion_id, + _encode(self.tokenizer, generated_text).shape[1], ) else "stop", ) @@ -351,10 +488,13 @@ def generate_chat_completion( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> "ChatCompletion": - assert self.tokenizer is not None and self.config is not None completion_id: str = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text: str = self._generate_text(prompt, settings=settings) + generated_text: str = "".join( + _generate_text_with_streaming( + self, prompt=prompt, settings=settings + ) + ) prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] completion_tokens: int = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] @@ -370,10 +510,62 @@ def generate_chat_completion( else "stop", ) - def encode(self, message: str, /) -> List[int]: - assert self.tokenizer is not None, "Tokenizer is not initialized" - return _encode(self.tokenizer, message).flatten().tolist() + def encode(self, text: str) -> List[int]: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return _encode(self._tokenizer, text).flatten().tolist() - def decode(self, tokens: List[int], /) -> str: - assert self.tokenizer is not None, "Tokenizer is not initialized" - return str(self.tokenizer.decode(IntTensor(tokens))) + def decode(self, ids: List[int], **kwargs) -> str: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return str(self._tokenizer.decode(IntTensor(ids))) + + def __del__(self) -> None: + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + if self._tokenizer is not None: + getattr(self._tokenizer, "__del__", lambda: None)() + del self._tokenizer + self._tokenizer = None + logger.info("🗑️ 
ExllamaCompletionGenerator tokenizer deleted") + if self._cache is not None: + getattr(self._cache, "__del__", lambda: None)() + del self._cache + self._cache = None + logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + + +@overload +def _encode( + tokenizer: ExLlamaTokenizer, + text: str, + return_mask: bool = False, +) -> Tensor: + ... + + +@overload +def _encode( + tokenizer: ExLlamaTokenizer, + text: List[str], + return_mask: bool = True, +) -> Tuple[Tensor, Tensor]: + ... + + +def _encode( + tokenizer: ExLlamaTokenizer, + text: Union[str, List[str]], + return_mask: bool = False, +) -> Union[Tensor, Tuple[Tensor, Tensor]]: + """Encode a text string into a tensor.""" + result = tokenizer.encode(text, return_mask=return_mask) + if return_mask: + ids, mask = result + assert isinstance(ids, Tensor) and isinstance(mask, Tensor) + return ids, mask + else: + ids = result[0] if isinstance(result, tuple) else result + assert isinstance(ids, Tensor) + return ids diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 3aa49b6..77a2df1 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,12 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import Dict, Iterator, List, Literal, Optional, Union +from typing import ( # noqa: F401 + Callable, + Iterator, + List, + Optional, + Union, +) from ..schemas.api import ( APIChatMessage, @@ -29,46 +35,23 @@ from repositories.llama_cpp import llama_cpp -def _make_logit_bias_processor( - llama: llama_cpp.Llama, - logit_bias: Dict[str, float], - logit_bias_type: Optional[Literal["input_ids", "tokens"]], -): - """Create a logit bias processor to bias the logit scores.""" - if logit_bias_type is None: - logit_bias_type = "input_ids" - - to_bias: Dict[int, float] = {} - if logit_bias_type == "input_ids": - for input_id_string, score in logit_bias.items(): - to_bias[int(input_id_string)] = score - - elif logit_bias_type == "tokens": - for token, score in logit_bias.items(): - for input_id in llama.tokenize( - token.encode("utf-8"), add_bos=False - ): - to_bias[input_id] = score - - def logit_bias_processor( - input_ids: List[int], - scores: List[float], - ) -> List[float]: - new_scores: List[float] = [0.0] * len(scores) - for input_id, score in enumerate(scores): - new_scores[input_id] = score + to_bias.get(input_id, 0.0) - - return new_scores - - return logit_bias_processor - - def _create_completion( client: llama_cpp.Llama, prompt: str, stream: bool, settings: TextGenerationSettings, ) -> Union[Completion, Iterator[CompletionChunk]]: + logit_processors = llama_cpp.LogitsProcessorList( + [ + processor.without_torch + for processor in BaseCompletionGenerator.get_logit_processors( + settings=settings, + encoder=lambda s: client.tokenize( + s.encode("utf-8"), add_bos=False + ), + ) + ] + ) return client.create_completion( # type: ignore stream=stream, prompt=prompt, @@ -85,17 +68,7 @@ def _create_completion( mirostat_mode=settings.mirostat_mode, mirostat_tau=settings.mirostat_tau, mirostat_eta=settings.mirostat_eta, - logits_processor=llama_cpp.LogitsProcessorList( - [ - _make_logit_bias_processor( - client, - settings.logit_bias, - settings.logit_bias_type, - ), - ] - ) - if settings.logit_bias is not None - else None, + logits_processor=logit_processors if logit_processors else None, stop=settings.stop, ) @@ -109,40 +82,16 @@ def _create_chat_completion( prompt: str = LlamaCppCompletionGenerator.convert_messages_into_prompt( messages, 
settings=settings ) - completion_or_chunks = client( - prompt=prompt, - temperature=settings.temperature, - top_p=settings.top_p, - top_k=settings.top_k, - stream=stream, - max_tokens=settings.max_tokens, - repeat_penalty=settings.repeat_penalty, - presence_penalty=settings.presence_penalty, - frequency_penalty=settings.frequency_penalty, - tfs_z=settings.tfs_z, - mirostat_mode=settings.mirostat_mode, - mirostat_tau=settings.mirostat_tau, - mirostat_eta=settings.mirostat_eta, - logits_processor=llama_cpp.LogitsProcessorList( - [ - _make_logit_bias_processor( - client, - settings.logit_bias, - settings.logit_bias_type, - ), - ] - ) - if settings.logit_bias is not None - else None, - stop=settings.stop, + completion_or_chunks = _create_completion( + client=client, prompt=prompt, stream=stream, settings=settings ) if isinstance(completion_or_chunks, Iterator): return convert_text_completion_chunks_to_chat( - completion_or_chunks, # type: ignore + completion_or_chunks, ) else: return convert_text_completion_to_chat( - completion_or_chunks, # type: ignore + completion_or_chunks, ) @@ -294,12 +243,12 @@ def generate_chat_completion_with_streaming( return # the generator was interrupted yield chunk - def encode(self, text: str, add_bos: bool = True) -> List[int]: + def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: assert self.client is not None, "Client is not initialized" return self.client.tokenize( text.encode("utf-8", errors="ignore"), add_bos=add_bos ) - def decode(self, tokens: List[int]) -> str: + def decode(self, ids: List[int], **kwargs) -> str: assert self.client is not None, "Client is not initialized" - return self.client.detokenize(tokens).decode("utf-8", errors="ignore") + return self.client.detokenize(ids).decode("utf-8", errors="ignore") diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 45a08e3..93f8dab 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -46,7 +46,7 @@ class APIChatMessage(BaseModel): ) class Config: - from_attributes = True + frozen = True class TextGenerationSettings(BaseModel): @@ -114,7 +114,6 @@ class TextGenerationSettings(BaseModel): "t so far, decreasing the model's likelihood to repeat the same line verbatim." ), ) - presence_penalty: float = Field( default=0.0, ge=-2.0, @@ -136,6 +135,13 @@ class TextGenerationSettings(BaseModel): "9) will be more lenient." ), ) + repetition_penalty_range: int = Field( + default=0, + ge=0, + description=( + "The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used." + ), + ) top_k: int = Field( default=40, ge=0, @@ -195,6 +201,24 @@ class TextGenerationSettings(BaseModel): default=False, description="If True, the EOS token is banned from being generated.", ) + muse: bool = Field( + default=False, + description="Use Muse logit processor (experimental). " + "Muse logit processor performs dampening of the k highest probability elements.", + ) + guidance_scale: float = Field( + default=1.0, + ge=1.0, + description="The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`. " + "Higher guidance scale encourages the model to generate samples that are more closely linked to the input " + "prompt, usually at the expense of poorer quality", + ) + negative_prompt: Optional[str] = Field( + default=None, + description="The negative prompt for classifier free guidance (CFG). 
" + "The negative prompt is used to encourage the model not to generate samples that are too similar to the " + "negative prompt. CFG is enabled by setting `guidance_scale > 1`.", + ) class CreateEmbeddingRequest(BaseModel): @@ -243,6 +267,12 @@ class CreateChatCompletionRequest(TextGenerationSettings): stream: bool = Field( default=False, description="Whether to stream the response." ) + functions: Optional[FunctionProperty] = Field( + default=None, description="The functions to invoke." + ) + function_call: Optional[ + Union[FunctionProperty, Literal["auto", "none"]] + ] = Field(default=None, description="The function call to invoke.") class Config: json_schema_extra = { diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 4704834..4fa59f5 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -101,6 +101,14 @@ class ExllamaModel(BaseLLMModel): "window size from 2048 to 4096, set this to 2.0." }, ) + alpha_value: Optional[float] = field( + default=None, + metadata={ + "description": "Positional embeddings alpha factor for " + "NTK RoPE scaling. Use either this or compress_pos_emb, " + "not both at the same time." + }, + ) gpu_peer_fix: bool = field( default=False, metadata={ diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 75f3a2b..eca2af2 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -94,8 +94,6 @@ def get_completion_generator( ) if body.model in openai_replacement_models: body.model = openai_replacement_models[body.model] - if not isinstance(body, CreateEmbeddingRequest): - body.logit_bias = None # Check if the model is defined in LLMModels enum llm_model = get_model(body.model) @@ -111,17 +109,13 @@ def get_completion_generator( # Before creating new one, deallocate embeddings to free up memory if embedding_generators: free_memory_of_first_item_from_container( - embedding_generators, - min_free_memory_mb=512, - logger=logger, + embedding_generators, logger=logger ) # Before creating a new completion generator, check memory usage if completion_generators.maxlen == len(completion_generators): free_memory_of_first_item_from_container( - completion_generators, - min_free_memory_mb=256, - logger=logger, + completion_generators, logger=logger ) # Create a new completion generator @@ -167,16 +161,12 @@ def get_embedding_generator( # Before creating a new completion generator, check memory usage if embedding_generators.maxlen == len(embedding_generators): free_memory_of_first_item_from_container( - embedding_generators, - min_free_memory_mb=256, - logger=logger, + embedding_generators, logger=logger ) # Before creating a new, deallocate embeddings to free up memory if completion_generators: free_memory_of_first_item_from_container( - completion_generators, - min_free_memory_mb=512, - logger=logger, + completion_generators, logger=logger ) if "sentence" in body.model and "encoder" in body.model: diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index f1580b3..f36a328 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -167,7 +167,7 @@ def error_message_wrapper( return 500, ErrorResponse( message=str(error), type="internal_server_error", - param=f"traceback:: {parse_trackback(error)}", + param=f"traceback:: {parse_traceback(error)}", code=type(error).__name__, ) @@ -255,7 +255,7 @@ async def custom_route_handler(self, request: Request) -> Response: ) -def parse_trackback(exception: Exception) -> str: +def parse_traceback(exception: 
Exception) -> str: """Parses traceback information from the exception""" if ( exception.__traceback__ is not None diff --git a/llama_api/utils/process_pool.py b/llama_api/utils/process_pool.py index ecaed7a..c3358d4 100644 --- a/llama_api/utils/process_pool.py +++ b/llama_api/utils/process_pool.py @@ -9,6 +9,7 @@ from multiprocessing import Process, Queue, cpu_count from threading import Thread from time import sleep +from traceback import format_exception from types import TracebackType from typing import ( Any, @@ -22,11 +23,15 @@ Union, ) +from llama_api.utils.logger import ApiLogger + if sys.version_info >= (3, 10): from typing import ParamSpec else: from typing_extensions import ParamSpec +logger = ApiLogger(__name__) + class _WrappedWorkerException(Exception): # type: ignore def __init__( @@ -146,6 +151,9 @@ def _worker_job_loop( except Exception as e: # If it fails, we need to send the exception back error = _WrappedWorkerException(str(e), e.__class__.__name__) + logger.error( + "".join(format_exception(e.__class__, e, e.__traceback__)) + ) result = None try: # We're using pickle to serialize the result @@ -261,6 +269,9 @@ def result(self) -> Optional[Tuple[Any, Exception]]: unwrapped_err = err.exception unwrapped_err.__traceback__ = err.traceback err = unwrapped_err + logger.error( + f"Error in worker process: {err.__class__.__name__}: {err}" + ) return ret, err except queue.Empty: if not self.process.is_alive(): diff --git a/llama_api/utils/system.py b/llama_api/utils/system.py index f2f4410..78ca571 100644 --- a/llama_api/utils/system.py +++ b/llama_api/utils/system.py @@ -162,7 +162,7 @@ def empty_cache(): logger.warning( ( f"RAM + VRAM usage did not decrease " - "by at least {min_free_memory_mb} MB " + f"by at least {min_free_memory_mb} MB " "after removing the oldest object.\n" "This may indicate a memory leak.\n" f"- Memory usage before: {mem_usage_before} MB\n" From 9a726b4a9eae1dd1b81d454a3b0e970d54da9af9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 11:39:57 +0900 Subject: [PATCH 02/15] Added xformers --- llama_api/modules/base.py | 6 +- llama_api/modules/exllama.py | 34 +++-- llama_api/modules/llama_cpp.py | 46 ++---- llama_api/modules/xformers.py | 133 ++++++++++++++++++ llama_api/schemas/api.py | 2 +- .../{function_calling.py => function_call.py} | 0 llama_api/schemas/models.py | 23 ++- llama_api/server/app_settings.py | 9 +- llama_api/server/routers/v1.py | 2 +- llama_api/utils/concurrency.py | 11 +- llama_api/utils/dependency.py | 4 +- llama_api/utils/errors.py | 2 +- main.py | 19 ++- 13 files changed, 222 insertions(+), 69 deletions(-) create mode 100644 llama_api/modules/xformers.py rename llama_api/schemas/{function_calling.py => function_call.py} (100%) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 6bd6286..09b8291 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass +from dataclasses import asdict, dataclass from typing import Any, Iterator, List, TypeVar from llama_api.mixins.logits import LogitsMixin @@ -25,6 +25,10 @@ class BaseLLMModel: model_path: str = "/path/to/model" max_total_tokens: int = 2048 + @property + def asdict(self) -> dict: + return asdict(self) + class BaseCompletionGenerator( ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 7fb0a0f..293682d 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ 
-1,4 +1,19 @@ """Wrapper for exllama to generate text completions.""" +# flake8: noqa +from os import environ + +from ..utils.logger import ApiLogger + +logger = ApiLogger(__name__) +if environ.get("LLAMA_API_XFORMERS") == "1": + try: + from ..modules.xformers import hijack_attention_forward + + hijack_attention_forward() + except Exception as e: + logger.warning( + f"xformers mode is enabled, but xformers is not installed: {e}" + ) from pathlib import Path from typing import ( TYPE_CHECKING, @@ -24,8 +39,6 @@ make_completion_chunk, ) from ..utils.dependency import import_repository -from ..utils.logger import ApiLogger -from ..utils.path import resolve_model_path_to_posix from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator @@ -48,7 +61,7 @@ ) assert cuda.is_available(), "CUDA must be available to use ExLlama." -logger = ApiLogger(__name__) + _stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops @@ -126,7 +139,7 @@ def _apply_settings_to_generator( ) -> ExLlamaGenerator: """Apply the settings to the generator.""" # Make sure that the batch size is correct - required_batch_size = 1 if settings.guidance_scale == 1 else 2 + required_batch_size = 1 if settings.guidance_scale <= 1 else 2 cache_batch_size = cg.cache.batch_size # type: int if cache_batch_size != required_batch_size: cg._cache = None @@ -159,7 +172,9 @@ def _gen_single_token_with_cfg( generator: ExLlamaGenerator, mask: Tensor, cfg_alpha: float ) -> int: logits = generator.model.forward( - generator.sequence[:, -1:], cache=generator.cache, input_mask=mask + generator.sequence[:, -1:], + cache=generator.cache, + input_mask=mask, ) # type: Tensor # type: ignore generator.apply_rep_penalty(logits) probs = log_softmax(logits, dim=-1) @@ -183,7 +198,7 @@ def _gen_single_token_without_cfg( if generator.sequence is not None: logits = generator.model.forward( generator.sequence[:, -1:], - generator.cache, + cache=generator.cache, lora=generator.lora, input_mask=mask, ) # type: Tensor # type: ignore @@ -375,12 +390,7 @@ def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": result = cls() - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ) - ) + model_folder_path = Path(llm_model.model_path_resolved) result._config = _make_config(model_folder_path, llm_model) result._tokenizer = ExLlamaTokenizer( (model_folder_path / "tokenizer.model").as_posix() diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 77a2df1..10501b9 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,7 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature from typing import ( # noqa: F401 + TYPE_CHECKING, Callable, Iterator, List, @@ -23,7 +24,6 @@ ) from ..utils.dependency import import_repository from ..utils.logger import ApiLogger -from ..utils.path import resolve_model_path_to_posix from .base import BaseCompletionGenerator logger = ApiLogger(__name__) @@ -122,40 +122,16 @@ def llm_model(self) -> "LlamaCppModel": def from_pretrained( cls, llm_model: "LlamaCppModel" ) -> "LlamaCppCompletionGenerator": - additional_kwargs = {} - arg_keys = signature(llama_cpp.Llama.__init__).parameters.keys() - if "rope_freq_base" in arg_keys: - additional_kwargs.update( - {"rope_freq_base": llm_model.rope_freq_base}, - ) - if "rope_freq_scale" in arg_keys: - additional_kwargs.update( - {"rope_freq_scale": 
llm_model.rope_freq_scale} - ) - client = llama_cpp.Llama( - model_path=resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/ggml", - ), - n_ctx=llm_model.max_total_tokens, - n_parts=llm_model.n_parts, - n_gpu_layers=llm_model.n_gpu_layers, - seed=llm_model.seed, - f16_kv=llm_model.f16_kv, - logits_all=llm_model.logits_all, - vocab_only=llm_model.vocab_only, - use_mmap=llm_model.use_mmap, - use_mlock=llm_model.use_mlock, - embedding=llm_model.embedding, - n_threads=llm_model.n_threads, - n_batch=llm_model.n_batch, - last_n_tokens_size=llm_model.last_n_tokens_size, - lora_base=llm_model.lora_base, - lora_path=llm_model.lora_path, - low_vram=llm_model.low_vram, - verbose=llm_model.echo, - **additional_kwargs, - ) + kwargs = { + # Get all attributes of llm_model + key: value + for key, value in llm_model.asdict.items() + # Hacky way to pass arguments to older versions of llama-cpp-python + if key in signature(llama_cpp.Llama.__init__).parameters.keys() + } + kwargs["model_path"] = llm_model.model_path_resolved + kwargs["verbose"] = llm_model.verbose and llm_model.echo + client = llama_cpp.Llama(**kwargs) if llm_model.cache: cache_type = llm_model.cache_type if cache_type is None: diff --git a/llama_api/modules/xformers.py b/llama_api/modules/xformers.py new file mode 100644 index 0000000..16d5695 --- /dev/null +++ b/llama_api/modules/xformers.py @@ -0,0 +1,133 @@ +# flake8: noqa +import math +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import transformers.models.llama.modeling_llama +from xformers.ops import memory_efficient_attention, LowerTriangularMask +from torch import Tensor, cat, finfo, float32, matmul, softmax, tensor + +from ..utils.logger import ApiLogger + +if TYPE_CHECKING: + from transformers.models.llama.modeling_llama import LlamaAttention + + +logger = ApiLogger(__name__) + + +def hijack_attention_forward(): + transformers.models.llama.modeling_llama.LlamaAttention.forward = _forward + logger.info(f"Replaced attention forward with {__name__.split('.')[-1]}") + + +def _forward( + self: "LlamaAttention", + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + past_key_value: Optional[Tuple[Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + # COPY: oobabooga/text-generation-webui/modules/llama_attn_hijack.py + logger.info(f"Using {__name__.split('.')[-1]}") + bsz, q_len, _ = hidden_states.size() + + query_states = ( + self.q_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + key_states = ( + self.k_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + ( + query_states, + key_states, + ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = cat([past_key_value[0], key_states], dim=2) + value_states = cat([past_key_value[1], value_states], dim=2) # type: ignore + + past_key_value = (key_states, value_states) if use_cache else None # 
type: ignore + + # We only apply xformers optimizations if we don't need to output the whole attention matrix + if not output_attentions: + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. + # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. + if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = memory_efficient_attention( + query_states, key_states, value_states, attn_bias=None + ) + else: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = memory_efficient_attention( + query_states, + key_states, + value_states, + attn_bias=LowerTriangularMask(), + ) + attn_weights = None + else: + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, tensor(finfo(attn_weights.dtype).min) + ) + + # upcast attention to fp32 + attn_weights = softmax(attn_weights, dim=-1, dtype=float32).to( + query_states.dtype + ) + attn_output = matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + return ( + self.o_proj(attn_output.reshape(bsz, q_len, self.hidden_size)), + attn_weights, + past_key_value, + ) diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 93f8dab..543c76c 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -8,7 +8,7 @@ from pydantic.main import BaseModel from typing_extensions import TypedDict -from .function_calling import FunctionProperty +from .function_call import FunctionProperty # If python version >= 3.11, use the built-in NotRequired type. # Otherwise, import it from typing_extensi diff --git a/llama_api/schemas/function_calling.py b/llama_api/schemas/function_call.py similarity index 100% rename from llama_api/schemas/function_calling.py rename to llama_api/schemas/function_call.py diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 4fa59f5..9e4027d 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -1,6 +1,9 @@ from dataclasses import dataclass, field +from functools import cached_property from typing import List, Literal, Optional +from llama_api.utils.path import resolve_model_path_to_posix + from ..modules.base import BaseLLMModel @@ -64,7 +67,8 @@ class LlamaCppModel(BaseLLMModel): cache: bool = ( False # The size of the cache in bytes. Only used if cache is True. ) - echo: bool = True # Whether to echo the prompt. + verbose: bool = True # Whether to echo the prompt. 
+ echo: bool = True # Compatibility of verbose. lora_base: Optional[str] = None # The path to the Llama LoRA base model. lora_path: Optional[ str @@ -86,6 +90,16 @@ class LlamaCppModel(BaseLLMModel): # Refer: https://github.com/ggerganov/llama.cpp/pull/2054 rope_freq_base: float = 10000.0 # I use 26000 for n_ctx=4096. rope_freq_scale: float = 1.0 # Generally, 2048 / n_ctx. + n_gqa: Optional[int] = None # TEMPORARY: Set to 8 for Llama2 70B + rms_norm_eps: Optional[float] = None # TEMPORARY + mul_mat_q: Optional[bool] = None # TEMPORARY + + @cached_property + def model_path_resolved(self): + return resolve_model_path_to_posix( + self.model_path, + default_relative_directory="models/ggml", + ) @dataclass @@ -136,3 +150,10 @@ class ExllamaModel(BaseLLMModel): matmul_no_half2: bool = False silu_no_half2: bool = False concurrent_streams: bool = False + + @cached_property + def model_path_resolved(self): + return resolve_model_path_to_posix( + self.model_path, + default_relative_directory="models/gptq", + ) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index bd472da..9c30a7f 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -105,6 +105,8 @@ def initialize_before_launch( # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") + if environ.get("LLAMA_API_XFORMERS") == "1": + install_package("xformers") else: logger.warning( "🏃‍♂️ Skipping package installation... " @@ -149,13 +151,12 @@ async def health(): def run( port: int, - max_workers: int = 1, install_packages: bool = False, force_cuda: bool = False, skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, - api_key: Optional[str] = None, + environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( git_and_disk_paths=Config.git_and_disk_paths, @@ -169,8 +170,8 @@ def run( from uvicorn import Config as UvicornConfig from uvicorn import Server as UvicornServer - environ["MAX_WORKERS"] = str(max_workers) - environ["API_KEY"] = api_key or "" + if environs: + environ.update(environs) UvicornServer( config=UvicornConfig( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 6b9715f..9e8b7f7 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -87,7 +87,7 @@ class WixMetadata: # processing a request. This is used to prevent multiple requests from # creating multiple completion generators at the same time. 
wixs: Tuple[WixMetadata] = tuple( - WixMetadata() for _ in range(int(environ.get("MAX_WORKERS", 1))) + WixMetadata() for _ in range(int(environ.get("LLAMA_API_MAX_WORKERS", 1))) ) diff --git a/llama_api/utils/concurrency.py b/llama_api/utils/concurrency.py index 1b5d877..797a1be 100644 --- a/llama_api/utils/concurrency.py +++ b/llama_api/utils/concurrency.py @@ -36,13 +36,6 @@ def init_process_pool(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): environ[key] = value - cuda_home = environ.get("CUDA_HOME", None) - cuda_path = environ.get("CUDA_PATH", None) - if cuda_path is not None and cuda_home is None: - environ["CUDA_HOME"] = cuda_path - elif cuda_home is not None and cuda_path is None: - environ["CUDA_PATH"] = cuda_home - def pool() -> ProcessPool: """Get the process pool, and initialize it if it's not initialized yet""" @@ -51,14 +44,14 @@ def pool() -> ProcessPool: if _pool is None: logger.info("Initializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("MAX_WORKERS", 1)), + max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), initializer=init_process_pool, initargs=(dict(environ),), ) elif not _pool.is_available: logger.critical("🚨 Process pool died. Reinitializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("MAX_WORKERS", 1)), + max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), initializer=init_process_pool, initargs=(dict(environ),), ) diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index a0cb6c3..8dbee87 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -218,12 +218,12 @@ def import_repository(git_path: str, disk_path: str): sys.path.remove(str(disk_path)) -def install_package(package: str, force: bool = False) -> bool: +def install_package(package: str, *args, force: bool = False) -> bool: """Install a package with pip.""" if not force and is_package_available(package): return True return run_command( - [sys.executable, "-m", "pip", "install", package], + [sys.executable, "-m", "pip", "install", package, *args], action="install", name=package, ) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index f36a328..dc7f4e2 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = environ.get("API_KEY", None) or None + api_key: Optional[str] = environ.get("LLAMA_API_API_KEY") or None @cached_property def authorization(self) -> Optional[str]: diff --git a/main.py b/main.py index 74a0f1e..9b8d7f7 100644 --- a/main.py +++ b/main.py @@ -41,6 +41,11 @@ action="store_true", help="Skip installing tensorflow, if `install-pkgs` is set", ) + parser.add_argument( + "--skip-compile", + action="store_true", + help="Skip compiling the shared library of LLaMA C++ code", + ) parser.add_argument( "-k", "--api-key", @@ -48,14 +53,24 @@ default=None, help="API key to use for the server", ) + parser.add_argument( + "-x", + "--xformers", + action="store_true", + help="Apply xformers' memory-efficient optimizations", + ) args = parser.parse_args() run( port=args.port, - max_workers=args.max_workers, install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, - api_key=args.api_key, + skip_compile=args.skip_compile, + environs={ + "LLAMA_API_MAX_WORKERS": str(args.max_workers), + "LLAMA_API_XFORMERS": "1" if 
args.xformers else "0", + "LLAMA_API_API_KEY": args.api_key or "", + }, ) From cfc18bf56e443e875dc629cc1df13d0e64bdc87d Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 14:41:46 +0900 Subject: [PATCH 03/15] Test suite refactor --- pyproject.toml | 2 +- requirements.txt | 22 ++-- tests/conftest.py | 176 +++++++++++++++++++++++++- tests/test_server.py | 289 ++++++++++--------------------------------- 4 files changed, 249 insertions(+), 240 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a88a9f5..2d80eb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ python = ">=3.8.1,<3.12" poetry = "^1.5.1" uvicorn = { extras = ["standard"], version = "^0.23" } -fastapi = "^0.100" +fastapi = "^0.100.1" orjson = "^3.9" sse-starlette = "^1.6" psutil = "^5.9" diff --git a/requirements.txt b/requirements.txt index 75b8ca9..ac45196 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (s charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" -cmake==3.27.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +cmake==3.27.1 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" cryptography==41.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" @@ -25,19 +25,19 @@ httptools==0.6.0 ; python_full_version >= "3.8.1" and python_version < "3.12" huggingface-hub==0.16.4 ; python_full_version >= "3.8.1" and python_version < "3.12" idna==3.4 ; python_full_version >= "3.8.1" and python_version < "3.12" importlib-metadata==6.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -importlib-resources==6.0.0 ; python_full_version >= "3.8.1" and python_version < "3.9" +importlib-resources==6.0.1 ; python_full_version >= "3.8.1" and python_version < "3.9" installer==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" jaraco-classes==3.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" jeepney==0.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" jsonschema-specifications==2023.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -jsonschema==4.18.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +jsonschema==4.19.0 ; python_full_version >= "3.8.1" and python_version < "3.12" keyring==23.13.1 ; python_full_version >= "3.8.1" and python_version < "3.12" lockfile==0.12.2 ; python_full_version >= "3.8.1" and python_version < "3.12" -more-itertools==10.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +more-itertools==10.1.0 ; python_full_version >= "3.8.1" and python_version < "3.12" msgpack==1.0.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ninja==1.11.1 ; python_full_version >= "3.8.1" and python_version < "3.12" numpy==1.24.4 ; python_full_version >= "3.8.1" and python_version < "3.12" -orjson==3.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +orjson==3.9.4 ; python_full_version >= "3.8.1" and python_version < "3.12" packaging==23.1 ; python_full_version >= "3.8.1" and python_version < "3.12" 
pexpect==4.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pkginfo==1.9.6 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -46,7 +46,7 @@ platformdirs==3.10.0 ; python_full_version >= "3.8.1" and python_version < "3.12 poetry-core==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry-plugin-export==1.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry==1.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -protobuf==4.23.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +protobuf==4.24.0 ; python_full_version >= "3.8.1" and python_version < "3.12" psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") @@ -58,12 +58,12 @@ python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12 pywin32-ctypes==0.2.2 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "win32" pyyaml==6.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" rapidfuzz==2.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -referencing==0.30.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -regex==2023.6.3 ; python_full_version >= "3.8.1" and python_version < "3.12" +referencing==0.30.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +regex==2023.8.8 ; python_full_version >= "3.8.1" and python_version < "3.12" requests-toolbelt==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" requests==2.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" rpds-py==0.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" -safetensors==0.3.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +safetensors==0.3.2 ; python_full_version >= "3.8.1" and python_version < "3.12" secretstorage==3.3.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.12" shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -75,9 +75,9 @@ tensorflow-hub==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3. 
tokenizers==0.13.3 ; python_full_version >= "3.8.1" and python_version < "3.12" tomli==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.11" tomlkit==0.12.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -tqdm==4.65.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tqdm==4.66.0 ; python_full_version >= "3.8.1" and python_version < "3.12" transformers==4.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -trove-classifiers==2023.7.6 ; python_full_version >= "3.8.1" and python_version < "3.12" +trove-classifiers==2023.8.7 ; python_full_version >= "3.8.1" and python_version < "3.12" typing-extensions==4.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" urllib3==1.26.16 ; python_full_version >= "3.8.1" and python_version < "3.12" uvicorn[standard]==0.23.2 ; python_full_version >= "3.8.1" and python_version < "3.12" diff --git a/tests/conftest.py b/tests/conftest.py index e276030..e96096e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,21 +1,49 @@ +from asyncio import gather +from datetime import datetime import importlib import unittest from os import environ from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Type # noqa: F401 +from re import compile, sub +from typing import ( + TYPE_CHECKING, + AsyncIterator, + Dict, + Iterable, + List, + Literal, + Optional, + Tuple, + Union, +) + +from orjson import loads +from llama_api.schemas.api import ( + ChatCompletionChoice, + ChatCompletionChunk, + CompletionChoice, + CompletionChunk, + ModelList, +) from llama_api.server.app_settings import create_app_llama_cpp from llama_api.shared.config import Config -from llama_api.utils.dependency import install_package, is_package_available from llama_api.utils.concurrency import _pool +from llama_api.utils.dependency import install_package, is_package_available +from llama_api.utils.system import get_cuda_version if TYPE_CHECKING: + from typing import Type # noqa: F401 + from fastapi.testclient import TestClient # noqa: F401 - from httpx import AsyncClient # noqa: F401 + from httpx import AsyncClient, Response # noqa: F401 + + +EndPoint = Literal["completions", "chat/completions"] class TestLlamaAPI(unittest.TestCase): - ggml_model: str = "orca-mini-3b.ggmlv3.q4_1.bin" + ggml_model: str = "orca-mini-3b.ggmlv3.q4_0.bin" ggml_path: Path = Config.project_root / Path(f"models/ggml/{ggml_model}") gptq_model: str = "orca_mini_7b" @@ -37,9 +65,147 @@ def setUpClass(cls): "fastapi.testclient" ).TestClient # type: Type[TestClient] cls.app = create_app_llama_cpp() - environ["MAX_WORKERS"] = "2" + environ["LLAMA_API_MAX_WORKERS"] = "2" @classmethod def tearDownClass(cls): if _pool is not None: _pool.shutdown(wait=True) + + @property + def check_ggml(self) -> None: + if not self.ggml_path.exists(): + self.skipTest(f"No model in {self.ggml_path}") + + @property + def check_gptq(self) -> None: + if not self.gptq_path.exists(): + self.skipTest(f"No model in {self.gptq_path}") + + @property + def check_cuda(self) -> None: + if not get_cuda_version(): + self.skipTest("CUDA is not available") + + async def arequest_completion( + self, + model_names: Union[List[str], Tuple[str, ...]], + endpoints: Union[EndPoint, Iterable[EndPoint]], + ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: + async with self.AsyncClient( + app=self.app, base_url="http://localhost", timeout=None + ) as client: + # Get models using the API + models = await self.get_models( + client=client, model_names=list(model_names) + ) # type: 
List[str] + + # Submit requests to the API and get responses + return await self.submit_streaming_requests( + client=client, + model_and_endpoints=zip( + models, + ( + [endpoints] * len(model_names) # type: ignore + if isinstance(endpoints, str) + else endpoints + ), + ), + ) + + async def get_models( + self, client: "AsyncClient", model_names: List[str] + ) -> List[str]: + # Get models using the API + model_resp: ModelList = (await client.get("/v1/models")).json() + models: List[str] = [] + for model_name in model_names: + model: Optional[str] = None + for model_data in model_resp["data"]: + if model_name in model_data["id"]: + model = sub(r"\(.*\)", "", model_data["id"]).strip() + break + self.assertTrue(model, f"Model {model_name} not found") + models.append(str(model)) + return models + + async def submit_streaming_requests( + self, + client: "AsyncClient", + model_and_endpoints: Iterable[Tuple[str, EndPoint]], + ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: + async def send_request( + model: str, endpoint: EndPoint + ) -> Tuple[List[str], datetime, datetime]: + async with client.stream( + method="POST", + url=f"/v1/{endpoint}", + json=self.union( + {"model": model, "max_tokens": 50}, + {"stream": True}, + {"messages": self.messages} + if endpoint.startswith("chat") + else {"prompt": self.prompt}, + ), + headers={"Content-Type": "application/json"}, + ) as response: + response.raise_for_status() + start_at = datetime.now() + results = [] # type: List[str] + async for chunk in self.extract_json_from_streaming_response( + response + ): + self.assertIn("choices", chunk, "No choices in response") + choice = chunk["choices"][0] + if "delta" in choice and choice["delta"].get("content"): + results.append(choice["delta"]["content"]) + elif "text" in choice: + results.append(choice["text"]) + self.assertGreaterEqual(len(results), 1, "No result in response") + return results, start_at, datetime.now() + + tasks = [ + send_request(model, endpoint) + for model, endpoint in model_and_endpoints + ] + return tuple(zip(*await gather(*tasks))) # type: ignore + + def harvest_results( + self, models: List[str], responses: List["Response"] + ) -> List[str]: + results: List[str] = [] + for model, response in zip(models, responses): + self.assertEqual(response.status_code, 200) + choice: Union[ + CompletionChoice, ChatCompletionChoice + ] = response.json()["choices"][0] + if "message" in choice: + results.append(choice["message"]["content"]) + elif "text" in choice: + results.append(choice["text"]) + else: + raise ValueError(f"Unknown response: {response.json()}") + print(f"Result of {model}:", results[-1], end="\n\n", flush=True) + self.assertEqual(len(results), len(models)) + return results + + async def extract_json_from_streaming_response( + self, + response: "Response", + ) -> AsyncIterator[Union[CompletionChunk, ChatCompletionChunk]]: + """Extract json from streaming `httpx.Response`""" + regex_finder = compile(rb"data:\s*({.+?})\s*\r?\n\s*\r?\n").finditer + bytes_buffer = bytearray() + async for stream in response.aiter_bytes(): + bytes_buffer.extend(stream) + for match in regex_finder(bytes_buffer): + try: + json_data = loads(match.group(1)) + yield json_data + bytes_buffer.clear() + except Exception: + continue + + @staticmethod + def union(*dicts: Dict) -> Dict: + return {k: v for d in dicts for k, v in d.items()} diff --git a/tests/test_server.py b/tests/test_server.py index 8255b4a..f44c132 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -1,32 +1,11 @@ -import re 
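To make the framing handled by extract_json_from_streaming_response above concrete: each server-sent event arrives as a "data: {...}" line terminated by a blank line, and the compiled pattern pulls the JSON object out of every complete frame in the byte buffer. The payloads below are made-up samples; only the framing matters.

    from re import compile
    from orjson import loads

    finder = compile(rb"data:\s*({.+?})\s*\r?\n\s*\r?\n").finditer
    buffer = bytearray(
        b'data: {"choices": [{"delta": {"content": "Hello"}}]}\r\n\r\n'
        b'data: {"choices": [{"text": " world"}]}\r\n\r\n'
    )
    for match in finder(buffer):
        print(loads(match.group(1)))  # one parsed chunk per complete SSE frame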
-from asyncio import gather -from typing import ( - Awaitable, - Dict, - Iterable, - List, - Literal, - Optional, - Tuple, - Union, -) +# flake8: noqa import unittest -from llama_api.utils.system import get_cuda_version -from tests.conftest import TestLlamaAPI - -from llama_api.schemas.api import ( - ModelList, - ChatCompletionChoice, - CompletionChoice, -) - -EndPoint = Literal["completions", "chat/completions"] +from tests.conftest import TestLlamaAPI -class TestServer(TestLlamaAPI, unittest.IsolatedAsyncioTestCase): - """Test the FastAPI server - with basic health checks & LLM completion requests (with concurrency)""" +class TestServerBasic(TestLlamaAPI): + """Test the FastAPI server with basic health checks""" def test_health(self): """Test the health endpoint""" @@ -37,239 +16,103 @@ def test_health(self): ) self.assertEqual(response.status_code, 200) + def test_v1_models(self): + """Test the v1/models endpoint""" + with self.TestClient(app=self.app) as client: + response = client.get( + "/v1/models", + headers={"Content-Type": "application/json"}, + ) + self.assertEqual(response.status_code, 200) + def test_import_llama_cpp(self): try: - from llama_api.modules.llama_cpp import ( # noqa: F401 - LlamaCppCompletionGenerator, + from llama_api.modules.llama_cpp import ( + LlamaCppCompletionGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") - @unittest.skipIf( - get_cuda_version() is None, - reason="No CUDA found on this system", - ) def test_import_exllama(self): + self.check_cuda try: - from llama_api.modules.exllama import ( # noqa: F401 - ExllamaCompletionGenerator, + from llama_api.modules.exllama import ( + ExllamaCompletionGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") def test_import_sentence_encoder(self): try: - from llama_api.modules.sentence_encoder import ( # noqa: F401 - SentenceEncoderEmbeddingGenerator, + from llama_api.modules.sentence_encoder import ( + SentenceEncoderEmbeddingGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") def test_import_transformer(self): try: - from llama_api.modules.transformer import ( # noqa: F401 - TransformerEmbeddingGenerator, - ) + from llama_api.modules.transformer import ( + TransformerEmbeddingGenerator, # noqa: F401 + ) # except ImportError as e: self.fail(f"Failed to import module: {e}") - def test_v1_models(self): - """Test the v1/models endpoint""" - with self.TestClient(app=self.app) as client: - response = client.get( - "/v1/models", - headers={"Content-Type": "application/json"}, - ) - self.assertEqual(response.status_code, 200) - @unittest.skipIf( - not TestLlamaAPI.ggml_path.exists(), - reason=f"No model in {TestLlamaAPI.ggml_path}", - ) - def test_llama_cpp(self): +class TestServerAdvanced(TestLlamaAPI, unittest.IsolatedAsyncioTestCase): + """Test the FastAPI server with advanced completion tests""" + + async def test_llama_cpp(self): """Test the Llama CPP model completion endpoints""" - self._request_completion( - model_names=(self.ggml_model,), endpoints="completions" + self.check_ggml + model_names = (self.ggml_model, self.ggml_model) + responses, starts, ends = await self.arequest_completion( + model_names=model_names, + endpoints=("chat/completions", "completions"), ) - self._request_completion( - model_names=(self.ggml_model,), endpoints="chat/completions" + start_1, end_1 = starts[0], ends[0] + print(f"GGML response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] 
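The assertions that follow boil down to checking whether the two generation windows are disjoint: two requests for the same model are expected to share a worker and therefore run back-to-back, while the mixed GGML/GPTQ test further down expects the windows to overlap. As a plain predicate (illustrative helper, not part of the test suite):

    from datetime import datetime

    def ran_sequentially(start_1: datetime, end_1: datetime,
                         start_2: datetime, end_2: datetime) -> bool:
        # Disjoint wall-clock intervals: one stream finished before the
        # other one started producing tokens.
        return end_1 < start_2 or end_2 < start_1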
+ print(f"GGML response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + end_1 < start_2 or end_2 < start_1, + f"Synchronous completion failed: {end_1} < {start_2} and {end_2} < {start_1}", ) - @unittest.skipIf( - not TestLlamaAPI.gptq_path.exists(), - reason=f"No model in{TestLlamaAPI.gptq_path}", - ) - def test_exllama(self): + async def test_exllama(self): """Test the ExLLama model completion endpoints""" - self._request_completion( - model_names=(self.gptq_model,), endpoints="completions" - ) - self._request_completion( - model_names=(self.gptq_model,), endpoints="chat/completions" - ) - - @unittest.skipIf( - not TestLlamaAPI.ggml_path.exists(), - reason=f"No model in {TestLlamaAPI.ggml_path}", - ) - async def test_llama_cpp_concurrency(self): - """Test the Llama CPP model completion endpoints with concurrency""" - model_names: Tuple[str, ...] = (self.ggml_model, self.ggml_model) - await self._arequest_completion( - model_names=model_names, endpoints="completions" + self.check_gptq + model_names = (self.gptq_model, self.gptq_model) + responses, starts, ends = await self.arequest_completion( + model_names=model_names, + endpoints=("chat/completions", "completions"), ) - - @unittest.skipIf( - not TestLlamaAPI.gptq_path.exists(), - reason=f"No model in {TestLlamaAPI.gptq_path}", - ) - async def test_exllama_concurrency(self): - """Test the ExLLama model completion endpoints with concurrency""" - model_names: Tuple[str, ...] = (self.gptq_model, self.gptq_model) - await self._arequest_completion( - model_names=model_names, endpoints="completions" + start_1, end_1 = starts[0], ends[0] + print(f"GPTQ response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] + print(f"GPTQ response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + end_1 < start_2 or end_2 < start_1, + f"Synchronous completion failed: {end_1} < {start_2} and {end_2} < {start_1}", ) - @unittest.skipIf( - (not TestLlamaAPI.ggml_path.exists()) - or (not TestLlamaAPI.gptq_path.exists()), - f"No model in {TestLlamaAPI.ggml_path} or {TestLlamaAPI.gptq_path}", - ) async def test_llama_mixed_concurrency(self): """Test the Llama CPP & ExLLama model completion endpoints with concurrency""" - model_names: Tuple[str, ...] 
= (self.ggml_model, self.gptq_model) - await self._arequest_completion( + self.check_ggml + self.check_gptq + model_names = (self.ggml_model, self.gptq_model) + responses, starts, ends = await self.arequest_completion( model_names=model_names, endpoints="completions" ) - - async def _arequest_completion( - self, - model_names: Union[List[str], Tuple[str, ...]], - endpoints: Union[EndPoint, Iterable[EndPoint]], - ): - _endpoints: Iterable[str] = ( - [endpoints] * len(model_names) - if isinstance(endpoints, str) - else endpoints - ) - async with self.AsyncClient( - app=self.app, base_url="http://localhost", timeout=None - ) as client: - # Get models using the API - model_resp: ModelList = (await client.get("/v1/models")).json() - models: List[str] = [] - for model_name in model_names: - model: Optional[str] = None - for model_data in model_resp["data"]: - if model_name in model_data["id"]: - model = re.sub(r"\(.*\)", "", model_data["id"]).strip() - break - self.assertTrue(model, f"Model {model_name} not found") - models.append(str(model)) - - # Submit requests to the API - tasks: List[Awaitable] = [] - for model, endpoint in zip(models, _endpoints): - request = {"model": model, "max_tokens": 50} - request_message = ( - {"messages": self.messages} - if endpoint.startswith("chat") - else {"prompt": self.prompt} - ) - tasks.append( - client.post( - f"/v1/{endpoint}", - json=_union( - request, {"stream": False}, request_message - ), - headers={"Content-Type": "application/json"}, - timeout=None, - ) - ) - - # Wait for responses - cmpl_resps: List = await gather(*tasks) - results: List[str] = [] - for model, cmpl_resp in zip(models, cmpl_resps): - self.assertEqual(cmpl_resp.status_code, 200) - choice: Union[ - CompletionChoice, ChatCompletionChoice - ] = cmpl_resp.json()["choices"][0] - if "message" in choice: - results.append(choice["message"]["content"]) - elif "text" in choice: - results.append(choice["text"]) - else: - raise ValueError(f"Unknown response: {cmpl_resp.json()}") - print( - f"Result of {model}:", results[-1], end="\n\n", flush=True - ) - - self.assertEqual(len(results), len(models)) - - def _request_completion( - self, - model_names: Union[List[str], Tuple[str, ...]], - endpoints: Union[EndPoint, Iterable[EndPoint]], - ): - _endpoints: Iterable[str] = ( - [endpoints] * len(model_names) - if isinstance(endpoints, str) - else endpoints + start_1, end_1 = starts[0], ends[0] + print(f"GGML response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] + print(f"GPTQ response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + start_2 < end_1 or start_1 < end_2, + f"Asynchronous completion failed: {start_1} < {end_2} and {start_2} < {end_1}", ) - with self.TestClient(app=self.app) as client: - # Get models using the API - model_resp = (client.get("/v1/models")).json() - models: List[str] = [] - for model_name in model_names: - model: Optional[str] = None - for model_data in model_resp["data"]: - if model_name in model_data["id"]: - model = re.sub(r"\(.*\)", "", model_data["id"]).strip() - break - self.assertTrue(model, f"Model {model_name} not found") - models.append(str(model)) - - # Submit requests to the API - cmpl_resps: List = [] - for model, endpoint in zip(models, _endpoints): - request = {"model": model, "max_tokens": 50} - request_message = ( - {"messages": self.messages} - if endpoint.startswith("chat") - else {"prompt": self.prompt} - ) - cmpl_resps.append( - client.post( - f"/v1/{endpoint}", - json=_union( - request, {"stream": False}, 
request_message - ), - headers={"Content-Type": "application/json"}, - timeout=None, - ) - ) - - # Wait for responses - results: List[str] = [] - for model, cmpl_resp in zip(models, cmpl_resps): - self.assertEqual(cmpl_resp.status_code, 200) - choice: Union[ - CompletionChoice, ChatCompletionChoice - ] = cmpl_resp.json()["choices"][0] - if "message" in choice: - results.append(choice["message"]["content"]) - elif "text" in choice: - results.append(choice["text"]) - else: - raise ValueError(f"Unknown response: {cmpl_resp.json()}") - print( - f"Result of {model}:", results[-1], end="\n\n", flush=True - ) - - self.assertEqual(len(results), len(models)) - - -def _union(*dicts: Dict) -> Dict: - return {k: v for d in dicts for k, v in d.items()} From 72d21f43d484053435f7f66d66f4db1dec9ca45b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 15:48:00 +0900 Subject: [PATCH 04/15] Added persistent docker compose file --- docker-compose.persistent.yml | 47 +++++++++++++++++++++++++++++++++++ docker-compose.yml | 5 ++-- 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 docker-compose.persistent.yml diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml new file mode 100644 index 0000000..4b0d767 --- /dev/null +++ b/docker-compose.persistent.yml @@ -0,0 +1,47 @@ +version: '3.8' + +volumes: + llama-api-models: + +services: + llama-api: + image: cosogi/llama-api:latest + entrypoint: ["python3", "-m", "main", "--port", "8000"] + environment: + - LLAMA_API_MAX_WORKERS=1 + - LLAMA_API_API_KEY= + volumes: + - llama-api-models:/app/models + - ./model_definitions.py:/app/model_definitions.py + - ./main.py:/app/main.py + ports: + - 8000:8000 + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] + + +# services: +# llama-api: +# build: +# context: . 
+# dockerfile: Dockerfile +# entrypoint: ["python3", "-m", "main", "--port", "8000"] +# environment: +# - LLAMA_API_MAX_WORKERS=1 +# - LLAMA_API_API_KEY= +# volumes: +# - llama-api-models:/app/models +# - ./model_definitions.py:/app/model_definitions.py +# - ./main.py:/app/main.py +# ports: +# - 8000:8000 +# deploy: +# resources: +# reservations: +# devices: +# - driver: nvidia +# capabilities: [gpu] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c87a432..d96c5d0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,10 +2,11 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230730 + image: cosogi/llama-api:latest entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - - MAX_WORKERS=1 + - LLAMA_API_MAX_WORKERS=1 + - LLAMA_API_API_KEY= volumes: - ./models:/app/models - ./llama_api:/app/llama_api From 05f6108d8be177f32030fe91443eaa76f82e14f9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 17:08:05 +0900 Subject: [PATCH 05/15] Support caching model path --- .gitignore | 1 + llama_api/schemas/models.py | 7 ++-- llama_api/utils/path.py | 73 +++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2b48ff4..4836eb0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ repositories/ .venv/ .vscode/ .test-venv/ +.temp/ PRIVATE_* \ No newline at end of file diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 9e4027d..e622438 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -2,9 +2,8 @@ from functools import cached_property from typing import List, Literal, Optional -from llama_api.utils.path import resolve_model_path_to_posix - from ..modules.base import BaseLLMModel +from ..utils.path import path_resolver @dataclass @@ -96,7 +95,7 @@ class LlamaCppModel(BaseLLMModel): @cached_property def model_path_resolved(self): - return resolve_model_path_to_posix( + return path_resolver( self.model_path, default_relative_directory="models/ggml", ) @@ -153,7 +152,7 @@ class ExllamaModel(BaseLLMModel): @cached_property def model_path_resolved(self): - return resolve_model_path_to_posix( + return path_resolver( self.model_path, default_relative_directory="models/gptq", ) diff --git a/llama_api/utils/path.py b/llama_api/utils/path.py index 880e99e..84f6f9d 100644 --- a/llama_api/utils/path.py +++ b/llama_api/utils/path.py @@ -1,7 +1,9 @@ +import orjson from pathlib import Path from re import compile from typing import List, Literal, Optional + from ..shared.config import Config from ..utils.huggingface_downloader import ( Classification, @@ -9,6 +11,7 @@ ) from ..utils.logger import ApiLogger + logger = ApiLogger(__name__) @@ -181,8 +184,78 @@ def resolve_model_path_to_posix( logger.info(f"`{path.name}` found in {parent_dir}") return (parent_dir / model_path).resolve().as_posix() + if model_path.count("/") != 1: + raise FileNotFoundError( + f"`{model_path}` not found in any of the following " + f"directories: {parent_dir_candidates}" + ) # Try to resolve the model path from Huggingface return HuggingfaceResolver(model_path).resolve() except Exception as e: logger.error(f"Error resolving model path: {e}") raise e + + +def resolve_model_path_to_posix_with_cache( + model_path: str, + default_relative_directory: Optional[str] = None, +) -> str: + """Resolve a model path to a POSIX path, with caching.""" + from filelock import FileLock, Timeout + + cache_file = Path(".temp/model_paths.json") + 
cache_file.parent.mkdir(parents=True, exist_ok=True) + try: + with FileLock( + cache_file.with_suffix(".lock"), timeout=10 + ): # Set a timeout if necessary + # Read the cache + try: + with open(cache_file, "r") as f: + cache = orjson.loads(f.read()) + assert isinstance(cache, dict) + except Exception: + cache = {} + + resolved = cache.get(model_path) + if not (isinstance(resolved, str) or resolved is None): + raise TypeError( + f"Invalid cache entry for model path `{model_path}`: " + f"{resolved}" + ) + if not resolved: + resolved = resolve_model_path_to_posix( + model_path, default_relative_directory + ) + cache[model_path] = resolved + + # Update the cache file + try: + with open(cache_file, "w") as f: + f.write(orjson.dumps(cache).decode()) + except Exception as e: + logger.error(f"Error writing model path cache: {e}") + return resolved + except (Timeout, TypeError) as e: + logger.warning( + "Error acquiring lock for model path cache" + + str(cache_file.with_suffix(".lock")) + + f": {e}" + ) + return resolve_model_path_to_posix( + model_path, default_relative_directory + ) + + +def path_resolver( + model_path: str, default_relative_directory: Optional[str] = None +) -> str: + """Resolve a model path to a POSIX path, with caching if possible.""" + try: + return resolve_model_path_to_posix_with_cache( + model_path, default_relative_directory + ) + except ImportError: + return resolve_model_path_to_posix( + model_path, default_relative_directory + ) From 681bfae8d31b584a4ad7b3e73219fab514e9a41c Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 01:41:11 +0900 Subject: [PATCH 06/15] Fixed CUDA docker image build error --- Dockerfile | 14 +++++----- docker-compose.persistent.yml | 3 ++- docker-compose.yml | 44 ++++++++++++++++---------------- llama_api/modules/llama_cpp.py | 7 ++--- llama_api/server/app_settings.py | 20 +++------------ llama_api/shared/config.py | 6 +---- llama_api/utils/dependency.py | 14 +++++++--- llama_api/utils/llama_cpp.py | 37 ++++++++++++++++----------- main.py | 3 ++- 9 files changed, 74 insertions(+), 74 deletions(-) diff --git a/Dockerfile b/Dockerfile index 85ba958..7757736 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,17 +2,16 @@ ### Approximately 5 ~ 10 minutes to build # Select the required CUDA version. -ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM nvidia/cuda:${CUDA_IMAGE} +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 ENV PYTHON_VERSION="3.11.4" ENV PYTHON_VERSION_SHORT="3.11" -ENV HOST 0.0.0.0 -ENV PORT=8000 # Copy the necessary files. -COPY requirements.txt /app/requirements.txt -COPY pyproject.toml /app/pyproject.toml COPY llama_api /app/llama_api +COPY pyproject.toml /app/pyproject.toml +COPY requirements.txt /app/requirements.txt +COPY main.py /app/main.py +COPY model_definitions.py /app/model_definitions.py # Install the necessary applications, and then install Python. # Then, install the necessary Python packages(Dependencies). @@ -41,7 +40,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && apt-get clean \ && rm -rf /tmp/* \ && cd /app \ - && python3 -m llama_api.server.app_settings --force-cuda --install-pkgs + && python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda + # Need to skip complie because GPU access to host is not supported when building image. # Set the working directory and start the server. 
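As a usage note for the path helpers introduced in patch 05 above: path_resolver() goes through the FileLock-guarded JSON cache when the filelock package is importable and otherwise falls back to a plain resolve, so callers only ever see a POSIX path string. A minimal sketch (the model file name is just an assumed example):

    from llama_api.utils.path import path_resolver

    # Looks under ./models/ggml first, falls back to the Huggingface resolver,
    # and memoizes the result in .temp/model_paths.json when filelock is available.
    resolved = path_resolver(
        "orca-mini-3b.ggmlv3.q4_0.bin",
        default_relative_directory="models/ggml",
    )
    print(resolved)  # a POSIX path string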
WORKDIR /app diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 4b0d767..971d8de 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,9 +5,10 @@ volumes: services: llama-api: - image: cosogi/llama-api:latest + image: cosogi/llama-api:230813 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: + - FORCE_CUDA=1 - LLAMA_API_MAX_WORKERS=1 - LLAMA_API_API_KEY= volumes: diff --git a/docker-compose.yml b/docker-compose.yml index d96c5d0..0ad51af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3' services: llama-api: - image: cosogi/llama-api:latest + image: cosogi/llama-api:230813 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - LLAMA_API_MAX_WORKERS=1 @@ -24,24 +24,24 @@ services: capabilities: [gpu] # services: - # llama-api: - # build: - # context: . - # dockerfile: Dockerfile - # entrypoint: ["python3", "-m", "main", "--port", "8000"] - # environment: - # - MAX_WORKERS=1 - # volumes: - # - ./models:/app/models - # - ./llama_api:/app/llama_api - # - ./model_definitions.py:/app/model_definitions.py - # - ./main.py:/app/main.py - # - ./requirements.txt:/app/requirements.txt - # ports: - # - 8000:8000 - # deploy: - # resources: - # reservations: - # devices: - # - driver: nvidia - # capabilities: [gpu] \ No newline at end of file +# llama-api: +# build: +# context: . +# dockerfile: Dockerfile +# entrypoint: ["python3", "-m", "main", "--port", "8000"] +# environment: +# - MAX_WORKERS=1 +# volumes: +# - ./models:/app/models +# - ./llama_api:/app/llama_api +# - ./model_definitions.py:/app/model_definitions.py +# - ./main.py:/app/main.py +# - ./requirements.txt:/app/requirements.txt +# ports: +# - 8000:8000 +# deploy: +# resources: +# reservations: +# devices: +# - driver: nvidia +# capabilities: [gpu] \ No newline at end of file diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 10501b9..7de518d 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,8 +1,6 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import ( # noqa: F401 - TYPE_CHECKING, - Callable, +from typing import ( Iterator, List, Optional, @@ -23,14 +21,17 @@ convert_text_completion_to_chat, ) from ..utils.dependency import import_repository +from ..utils.llama_cpp import build_shared_lib from ..utils.logger import ApiLogger from .base import BaseCompletionGenerator logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") +build_shared_lib(logger=logger) with import_repository( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", + options=["--recurse-submodules"], ): from repositories.llama_cpp import llama_cpp diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 9c30a7f..27abd75 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -3,13 +3,11 @@ from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path -from typing import Dict, Literal, Optional, Union +from typing import Dict, Literal, Optional -from ..shared.config import Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, - git_clone, install_all_dependencies, install_package, install_pytorch, @@ -67,7 +65,6 @@ def set_priority( def initialize_before_launch( - git_and_disk_paths: Optional[Dict[str, Union[str, Path]]] = None, 
install_packages: bool = False, force_cuda: bool = False, skip_pytorch_install: bool = False, @@ -75,14 +72,11 @@ def initialize_before_launch( skip_compile: bool = False, ) -> None: """Initialize the app""" - - # Git clone the repositories - if git_and_disk_paths is not None: - for git_path, disk_path in git_and_disk_paths.items(): - git_clone(git_path=git_path, disk_path=disk_path) - if install_packages: # Install all dependencies + if not skip_compile: + # Build the shared library of LLaMA C++ code + build_shared_lib(logger=logger, force_cuda=force_cuda) poetry = get_poetry_executable() if not poetry.exists(): # Install poetry @@ -99,10 +93,6 @@ def initialize_before_launch( project_paths = [Path(".")] + list(Path("repositories").glob("*")) install_all_dependencies(project_paths=project_paths) - if not skip_compile: - # Build the shared library of LLaMA C++ code - build_shared_lib(logger=logger) - # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") if environ.get("LLAMA_API_XFORMERS") == "1": @@ -159,7 +149,6 @@ def run( environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( - git_and_disk_paths=Config.git_and_disk_paths, install_packages=install_packages, force_cuda=force_cuda, skip_pytorch_install=skip_pytorch_install, @@ -217,7 +206,6 @@ def run( args = parser.parse_args() initialize_before_launch( - git_and_disk_paths=Config.git_and_disk_paths, install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 45c7d47..4ecd592 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import List, Tuple class Config: @@ -11,10 +11,6 @@ class Config: torch_version: str = "==2.0.1" torch_source: str = "https://download.pytorch.org/whl/torch_stable.html" tensorflow_version: str = "==2.13.0" - git_and_disk_paths: Dict[str, Union[Path, str]] = { - "https://github.com/abetlen/llama-cpp-python": "repositories/llama_cpp", # noqa: E501 - "https://github.com/turboderp/exllama": "repositories/exllama", - } ggml_quanitzation_preferences_order: List[str] = [ "q4_K_M", "q4_K_S", diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 8dbee87..9fc4dd4 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -60,11 +60,15 @@ def is_package_available(package: str) -> bool: return True if find_spec(package) else False -def git_clone(git_path: str, disk_path: Union[Path, str]) -> Optional[bool]: +def git_clone( + git_path: str, + disk_path: Union[Path, str], + options: Optional[List[str]] = None, +) -> Optional[bool]: """Clone a git repository to a disk path.""" if not Path(disk_path).exists(): return run_command( - ["git", "clone", git_path, str(disk_path)], + ["git", "clone", git_path, str(disk_path), *(options or [])], action="clone", name=f"{git_path} to {disk_path}", try_emoji="📥", @@ -203,14 +207,16 @@ def convert_toml_to_requirements_with_poetry( @contextmanager -def import_repository(git_path: str, disk_path: str): +def import_repository( + git_path: str, disk_path: str, options: Optional[List[str]] = None +): """ Import a repository from git. The repository will be cloned to disk_path. The dependencies will be installed from pyproject.toml or requirements.txt. 
""" # Clone the repository - git_clone(git_path=git_path, disk_path=disk_path) + git_clone(git_path=git_path, disk_path=disk_path, options=options) # Add the repository to the path so that it can be imported sys.path.insert(0, str(disk_path)) diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index 75eb0c2..fc16ef2 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -13,10 +13,15 @@ # You can set the CMAKE_ARGS environment variable to change the cmake args. # cuBLAS is default to ON if CUDA is installed. # CPU inference is default if CUDA is not installed. -if get_cuda_version() is None: - CMAKE_ARGS: str = "-DBUILD_SHARED_LIBS=ON" +METAL_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_METAL=ON" +CUBLAS_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_CUBLAS=ON" +CPU_ARGS = "-DBUILD_SHARED_LIBS=ON" +if sys.platform == "darwin": + CMAKE_ARGS: str = METAL_ARGS +elif get_cuda_version() is None: + CMAKE_ARGS: str = CPU_ARGS else: - CMAKE_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_CUBLAS=ON" + CMAKE_ARGS: str = CUBLAS_ARGS LIB_BASE_NAME: str = "llama" REPOSITORY_FOLDER: str = "repositories" @@ -60,7 +65,7 @@ def _temporary_change_cwd(path): chdir(prev_cwd) -def _git_clone() -> None: +def _git_clone_if_not_exists() -> None: # Clone the git repos if they don't exist for clone_path, clone_command in GIT_CLONES.items(): if not clone_path.exists() or not any(clone_path.iterdir()): @@ -131,6 +136,10 @@ def _cmake_args_to_make_args(cmake_args: List[str]) -> List[str]: # capitalize all letters cmake_arg = cmake_arg.upper() + # skip the `BUILD_SHARED_LIBS` flag + if "BUILD_SHARED_LIBS" in cmake_arg: + continue + # replace `ON` with `1` and `OFF` with `0` cmake_arg = cmake_arg.replace("=ON", "=1").replace("=OFF", "=0") @@ -147,15 +156,9 @@ def _make(make_dir: Path, make_args: List[str], target_dir: Path) -> None: # Run make to build the shared lib # Build the shared lib - run_command( - ["make", *make_args], - action="build", - name="llama.cpp shared lib", - cwd=make_dir, - ) for lib in _get_libs(): run_command( - ["make", lib], + ["make", *make_args, lib], action="build", name="llama.cpp shared lib", cwd=make_dir, @@ -199,20 +202,24 @@ def _cmake(cmake_dir: Path, cmake_args: List[str], target_dir: Path) -> None: def build_shared_lib( - logger: Optional[Logger] = None, - force_cmake: bool = bool(environ.get("FORCE_CMAKE", False)), + logger: Optional[Logger] = None, force_cuda: bool = False ) -> None: """Build the shared library for llama.cpp""" + global CMAKE_ARGS + if force_cuda or bool(environ.get("FORCE_CUDA", False)): + assert get_cuda_version() is not None, "CUDA is not available" + CMAKE_ARGS = CUBLAS_ARGS + if logger is None: logger = getLogger(__name__) logger.setLevel("INFO") # Git clone llama-cpp-python and llama.cpp - _git_clone() + _git_clone_if_not_exists() # Build the libs if they don't exist or if `force_cmake` is True - if force_cmake or not any( + if bool(environ.get("FORCE_CMAKE", False)) or not any( lib_path.exists() for lib_path in _get_lib_paths(MODULE_PATH) ): # Build the libs diff --git a/main.py b/main.py index 9b8d7f7..4215d49 100644 --- a/main.py +++ b/main.py @@ -70,7 +70,8 @@ skip_compile=args.skip_compile, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), - "LLAMA_API_XFORMERS": "1" if args.xformers else "0", + "LLAMA_API_XFORMERS": "1" if args.xformers else "", "LLAMA_API_API_KEY": args.api_key or "", + "FORCE_CUDA": "1" if args.force_cuda else "", }, ) From 0775d11677c4de1daccc2465c8994e651bf1b63d Mon Sep 17 00:00:00 2001 From: c0sogi 
Date: Mon, 14 Aug 2023 23:07:12 +0900 Subject: [PATCH 07/15] Added chat logger --- docker-compose.persistent.yml | 2 +- docker-compose.yml | 3 +- llama_api/modules/base.py | 13 ++++- llama_api/modules/exllama.py | 69 +++++++++++++++----------- llama_api/schemas/models.py | 4 +- llama_api/server/routers/v1.py | 89 ++++++++++++++++++++++++++++++++-- llama_api/utils/logger.py | 16 +++--- 7 files changed, 152 insertions(+), 44 deletions(-) diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 971d8de..08605d3 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,7 @@ volumes: services: llama-api: - image: cosogi/llama-api:230813 + image: cosogi/llama-api:230814 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/docker-compose.yml b/docker-compose.yml index 0ad51af..a914dfa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,9 +2,10 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230813 + image: cosogi/llama-api:230814 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: + - FORCE_CUDA=1 - LLAMA_API_MAX_WORKERS=1 - LLAMA_API_API_KEY= volumes: diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 09b8291..61b1e7e 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import asdict, dataclass +from pathlib import Path from typing import Any, Iterator, List, TypeVar -from llama_api.mixins.logits import LogitsMixin - from ..mixins.interrupt import InterruptMixin +from ..mixins.logits import LogitsMixin from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( APIChatMessage, @@ -29,6 +29,10 @@ class BaseLLMModel: def asdict(self) -> dict: return asdict(self) + @property + def model_path_resolved(self) -> str: + return self.model_path + class BaseCompletionGenerator( ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin @@ -86,6 +90,11 @@ def decode(self, ids: List[int], **kwargs: Any) -> str: def llm_model(self) -> "BaseLLMModel": """The LLM model used by this generator.""" + @property + def model_name(self) -> str: + """Identifier for the model used by this generator.""" + return Path(self.llm_model.model_path_resolved).stem + class BaseEmbeddingGenerator(ABC): @abstractmethod diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 293682d..7d9e960 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,5 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa +from gc import collect from os import environ from ..utils.logger import ApiLogger @@ -28,6 +29,7 @@ ) from torch import IntTensor, Tensor, cuda, version +from torch.cuda import empty_cache from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor @@ -255,7 +257,7 @@ def _generator( logit_processors = ( [ processor - for processor in BaseCompletionGenerator.get_logit_processors( + for processor in cg.get_logit_processors( settings=settings, encoder=cg.encode, ) @@ -332,12 +334,18 @@ def _generate_text_with_streaming( return_mask=True, ) generator.gen_begin(ids, mask=mask) + + prompt_tokens = ids.shape[-1] + context_window = cg.llm_model.max_total_tokens cg.raise_for_token_limit( - prompt_tokens=ids.shape[-1], - context_window=cg.llm_model.max_total_tokens, + prompt_tokens=prompt_tokens, context_window=context_window + ) + settings.max_tokens = min( + 
settings.max_tokens, context_window - prompt_tokens ) + yield from _generator( - cg, cfg_mask=mask, settings=settings, stops=stops + cg, settings=settings, cfg_mask=mask, stops=stops ) except Exception as e: logger.exception(e) @@ -406,8 +414,8 @@ def from_pretrained( def generate_completion_with_streaming( self, prompt: str, settings: "TextGenerationSettings" ) -> Iterator["CompletionChunk"]: - completion_id: str = settings.completion_id - model_path: str = str(self.config.model_path) + completion_id = settings.completion_id + model = self.model_name last_token: Optional[str] = None generated_text: str = "" for token in _generate_text_with_streaming( @@ -417,14 +425,14 @@ def generate_completion_with_streaming( if last_token is not None: yield make_completion_chunk( id=completion_id, - model=model_path, + model=model, text=last_token, finish_reason=None, ) last_token = token yield make_completion_chunk( id=completion_id, - model=model_path, + model=model, text=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( @@ -438,19 +446,19 @@ def generate_completion_with_streaming( def generate_completion( self, prompt: str, settings: "TextGenerationSettings" ) -> "Completion": - completion_id: str = settings.completion_id - generated_text: str = "".join( + completion_id = settings.completion_id + generated_text = "".join( _generate_text_with_streaming( self, prompt=prompt, settings=settings ) ) - n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] - n_completion_tokens: int = self._completion_status.get( + n_prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + n_completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) return make_completion( id=completion_id, - model=str(self.config.model_path), + model=self.model_name, text=generated_text, prompt_tokens=n_prompt_tokens, completion_tokens=n_completion_tokens, @@ -464,9 +472,9 @@ def generate_chat_completion_with_streaming( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> Iterator["ChatCompletionChunk"]: - completion_id: str = settings.completion_id + completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - model_path: str = str(self.config.model_path) + model = self.model_name last_token: Optional[str] = None generated_text: str = "" for token in _generate_text_with_streaming( @@ -476,14 +484,14 @@ def generate_chat_completion_with_streaming( if last_token is not None: yield make_chat_completion_chunk( id=completion_id, - model=model_path, + model=model, content=last_token, finish_reason=None, ) last_token = token yield make_chat_completion_chunk( id=completion_id, - model=model_path, + model=model, content=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( @@ -498,20 +506,20 @@ def generate_chat_completion( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> "ChatCompletion": - completion_id: str = settings.completion_id + completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text: str = "".join( + generated_text = "".join( _generate_text_with_streaming( self, prompt=prompt, settings=settings ) ) - prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] - completion_tokens: int = self._completion_status.get( + prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + completion_tokens = 
self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) return make_chat_completion( id=completion_id, - model=str(self.config.model_path), + model=self.model_name, content=generated_text, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -529,11 +537,6 @@ def decode(self, ids: List[int], **kwargs) -> str: return str(self._tokenizer.decode(IntTensor(ids))) def __del__(self) -> None: - if self._model is not None: - self._model.free_unmanaged() - del self._model - self._model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") if self._tokenizer is not None: getattr(self._tokenizer, "__del__", lambda: None)() del self._tokenizer @@ -544,6 +547,18 @@ def __del__(self) -> None: del self._cache self._cache = None logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + if self._generator is not None: + getattr(self._generator, "__del__", lambda: None)() + del self._generator + self._generator = None + logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + collect() + empty_cache() @overload diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index e622438..2bc1438 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -94,7 +94,7 @@ class LlamaCppModel(BaseLLMModel): mul_mat_q: Optional[bool] = None # TEMPORARY @cached_property - def model_path_resolved(self): + def model_path_resolved(self) -> str: return path_resolver( self.model_path, default_relative_directory="models/ggml", @@ -151,7 +151,7 @@ class ExllamaModel(BaseLLMModel): concurrent_streams: bool = False @cached_property - def model_path_resolved(self): + def model_path_resolved(self) -> str: return path_resolver( self.model_path, default_relative_directory="models/gptq", diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 9e8b7f7..2d6792c 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -35,12 +35,14 @@ from anyio.streams.memory import MemoryObjectSendStream from fastapi import APIRouter, Request from fastapi.concurrency import iterate_in_threadpool, run_in_threadpool -from orjson import dumps +from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse from ...schemas.api import ( ChatCompletion, + ChatCompletionChunk, Completion, + CompletionChunk, CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, @@ -53,7 +55,7 @@ run_in_processpool_with_wix, ) from ...utils.errors import RouteErrorHandler -from ...utils.logger import ApiLogger +from ...utils.logger import ApiLogger, LoggingConfig from ..pools.llama import ( generate_completion, generate_completion_chunks, @@ -61,6 +63,14 @@ get_model_names, ) +chat_logger = ApiLogger( + "", + logging_config=LoggingConfig( + console_log_level=100, + file_log_name="./logs/chat.log", + color=False, + ), +) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) T = TypeVar("T") @@ -73,6 +83,7 @@ class TaskStatus(TypedDict): started_at: float interrupted: bool embedding_chunks: Optional[int] + generated_text: str @dataclass @@ -149,6 +160,24 @@ async def get_wix_with_semaphore( raise LookupError("No available wix") +def get_text_from_completion( + completion: Union[Completion, ChatCompletion] +) -> str: + """Get the generated text from a completion""" + if 
"text" in completion["choices"][0]: + return completion["choices"][0]["text"] + return completion["choices"][0]["message"]["content"] + + +def get_text_from_chunk( + chunk: Union[CompletionChunk, ChatCompletionChunk] +) -> str: + """Get the generated text from a completion chunk""" + if "text" in chunk["choices"][0]: + return chunk["choices"][0]["text"] + return chunk["choices"][0]["delta"].get("content", "") + + async def get_event_publisher( request: Request, body: Union[ @@ -158,7 +187,7 @@ async def get_event_publisher( inner_send_chan: MemoryObjectSendStream, task: "Task[None]", interrupt_signal: Event, - iterator: Iterator, + iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], ) -> None: """Publish Server-Sent-Events (SSE) to the client""" with task_manager( @@ -170,6 +199,7 @@ async def get_event_publisher( try: async for chunk in iterate_in_threadpool(iterator): task_status["completion_tokens"] += 1 + task_status["generated_text"] += get_text_from_chunk(chunk) await inner_send_chan.send( b"data: " + dumps(chunk) + b"\n\n" ) @@ -198,6 +228,51 @@ def get_streaming_iterator( yield validate_item_type(gen, type=dict) +def log_request( + body: Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ], + task_status: TaskStatus, +) -> None: + body_without_prompt = body.model_dump( + exclude={"prompt", "messages", "input"}, + exclude_defaults=True, + exclude_unset=True, + exclude_none=True, + ) + if isinstance(body, CreateChatCompletionRequest): + chat_log = { + "request": body_without_prompt, + "chat": [ + body.messages[i].model_dump(exclude_none=True) + for i in range(len(body.messages)) + ] + + [ + { + "role": "assistant", + "content": task_status["generated_text"], + } + ], + } + elif isinstance(body, CreateCompletionRequest): + chat_log = { + "request": body_without_prompt, + "prompt": { + "user": body.prompt, + "assistant": task_status["generated_text"], + }, + } + else: + chat_log = { + "request": body_without_prompt, + "input": body.input, + "embedding": task_status["embedding_chunks"], + } + chat_logger.info(dumps(chat_log, option=OPT_INDENT_2).decode()) + + @contextmanager def task_manager( body: Union[ @@ -215,10 +290,10 @@ def task_manager( started_at=time(), interrupted=False, embedding_chunks=None, + generated_text="", ) try: logger.info(f"🦙 Handling request of {body.model}...") - logger.debug(f"🦙 Request body: {body}") yield task_status finally: # Cancel the producer task and set event, @@ -246,6 +321,7 @@ def task_manager( logger.info( f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" ) + log_request(body=body, task_status=task_status) async def create_chat_completion_or_completion( @@ -285,7 +361,7 @@ async def create_chat_completion_or_completion( inner_send_chan=send_chan, task=task, interrupt_signal=interrupt_signal, - iterator=get_streaming_iterator( + iterator=get_streaming_iterator( # type: ignore queue=queue, first_response=validate_item_type( await run_in_threadpool(queue.get), type=dict @@ -308,6 +384,9 @@ async def create_chat_completion_or_completion( task_status["completion_tokens"] = completion["usage"][ "completion_tokens" ] + task_status["generated_text"] = get_text_from_completion( + completion + ) return completion diff --git a/llama_api/utils/logger.py b/llama_api/utils/logger.py index 7cfe2bf..dbefbbb 100644 --- a/llama_api/utils/logger.py +++ b/llama_api/utils/logger.py @@ -1,8 +1,8 @@ """Logger module for the API""" import logging -import os from dataclasses import dataclass +from pathlib 
import Path from typing import Dict, Optional from .colorama import Fore, Style @@ -15,6 +15,7 @@ class LoggingConfig: file_log_level: Optional[int] = logging.DEBUG file_log_name: Optional[str] = "./logs/debug.log" logging_format: str = "[%(asctime)s] %(name)s:%(levelname)s - %(message)s" + color: bool = True class ColoredFormatter(logging.Formatter): @@ -52,7 +53,11 @@ def __init__( self, name: str, logging_config: LoggingConfig = LoggingConfig() ) -> None: super().__init__(name=name, level=logging_config.logger_level) - formatter = ColoredFormatter(logging_config.logging_format) + formatter = ( + ColoredFormatter(logging_config.logging_format) + if logging_config.color + else logging.Formatter(logging_config.logging_format) + ) console = logging.StreamHandler() console.setLevel(logging_config.console_log_level) @@ -62,10 +67,9 @@ def __init__( logging_config.file_log_name is not None and logging_config.file_log_level is not None ): - if not os.path.exists( - os.path.dirname(logging_config.file_log_name) - ): - os.makedirs(os.path.dirname(logging_config.file_log_name)) + Path(logging_config.file_log_name).parent.mkdir( + parents=True, exist_ok=True + ) file_handler = logging.FileHandler( filename=logging_config.file_log_name, mode="a", From 778c0bd13075c4739ed3095aab06b7e70c508fc9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 23:26:42 +0900 Subject: [PATCH 08/15] Fixed bug: llama.cpp context tokens --- llama_api/modules/llama_cpp.py | 1 + llama_api/server/routers/v1.py | 2 ++ llama_api/utils/errors.py | 2 +- main.py | 8 ++++++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 7de518d..d88e3e4 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -130,6 +130,7 @@ def from_pretrained( # Hacky way to pass arguments to older versions of llama-cpp-python if key in signature(llama_cpp.Llama.__init__).parameters.keys() } + kwargs["n_ctx"] = llm_model.max_total_tokens kwargs["model_path"] = llm_model.model_path_resolved kwargs["verbose"] = llm_model.verbose and llm_model.echo client = llama_cpp.Llama(**kwargs) diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 2d6792c..a7b2713 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -410,6 +410,8 @@ async def create_completion(request: Request, body: CreateCompletionRequest): async def create_embedding( body: CreateEmbeddingRequest, ) -> Embedding: + if not environ.get("LLAMA_API_EMBEDDINGS"): + raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" async with get_wix_with_semaphore(body.model) as wix: queue, interrupt_signal = get_queue_and_event() diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index dc7f4e2..3949d51 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -206,7 +206,7 @@ async def custom_route_handler(self, request: Request) -> Response: {"error": error_response}, status_code=401, ) - if authorization != self.authorization: + if authorization.lower() != self.authorization.lower(): api_key = authorization[len("Bearer ") :] # noqa: E203 error_response = ErrorResponse( message=( diff --git a/main.py b/main.py index 4215d49..7f8bd16 100644 --- a/main.py +++ b/main.py @@ -19,11 +19,13 @@ help="Maximum number of process workers to run; default is 1", ) parser.add_argument( + "-i", "--install-pkgs", action="store_true", help="Install all required packages 
before running the server", ) parser.add_argument( + "-c", "--force-cuda", action="store_true", help=( @@ -59,6 +61,11 @@ action="store_true", help="Apply xformers' memory-efficient optimizations", ) + parser.add_argument( + "--disable-embeddings", + action="store_true", + help="Disable embeddings endpoint", + ) args = parser.parse_args() run( @@ -73,5 +80,6 @@ "LLAMA_API_XFORMERS": "1" if args.xformers else "", "LLAMA_API_API_KEY": args.api_key or "", "FORCE_CUDA": "1" if args.force_cuda else "", + "LLAMA_API_EMBEDDINGS": "1" if not args.disable_embeddings else "", }, ) From fcb0d58f8d726bc7e4c6c149b18c1b95d88c4f01 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 23:29:48 +0900 Subject: [PATCH 09/15] Removed assertion: api key should start with "sk-" --- llama_api/utils/errors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 3949d51..e647c79 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -133,8 +133,6 @@ def authorization(self) -> Optional[str]: """API key for authentication""" if self.api_key is None: return None - if not self.api_key.startswith("sk-"): - self.api_key = f"sk-{self.api_key}" return f"Bearer {self.api_key}" def error_message_wrapper( From b85de0e27dab3ad34cd6d4c574780d413a443068 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:14:12 +0900 Subject: [PATCH 10/15] Improved worker load balancing --- llama_api/server/routers/v1.py | 116 ++++++++++++++++----------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index a7b2713..9a5ec15 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -8,6 +8,7 @@ from functools import partial from os import environ from queue import Queue +from random import choice from threading import Event from time import time from typing import ( @@ -66,13 +67,13 @@ chat_logger = ApiLogger( "", logging_config=LoggingConfig( - console_log_level=100, - file_log_name="./logs/chat.log", - color=False, + console_log_level=100, file_log_name="./logs/chat.log", color=False ), ) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) +max_workers = int(environ.get("LLAMA_API_MAX_WORKERS", 1)) +max_semaphores = int(environ.get("LLAMA_API_MAX_SEMAPHORES", 1)) T = TypeVar("T") @@ -90,18 +91,58 @@ class TaskStatus(TypedDict): class WixMetadata: """Worker index (wix) metadata""" - key: Optional[str] = None - semaphore: Semaphore = field(default_factory=lambda: Semaphore(1)) + wix: int + processed_key: Optional[str] = None + semaphore: Semaphore = field( + default_factory=lambda: Semaphore(max_semaphores) + ) # Worker index (wix) is used to keep track of which worker is currently # processing a request. This is used to prevent multiple requests from # creating multiple completion generators at the same time. -wixs: Tuple[WixMetadata] = tuple( - WixMetadata() for _ in range(int(environ.get("LLAMA_API_MAX_WORKERS", 1))) +wix_metas: Tuple[WixMetadata] = tuple( + WixMetadata(wix) for wix in range(max_workers) ) +def get_worker_rank(meta: WixMetadata, request_key: Optional[str]) -> int: + """Get the entry rank for the worker index (wix) metadata. 
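+    A worker already serving the requested model ranks -2, a worker or
+    request without a pinned model ranks -1, and any other worker ranks by
+    the number of semaphore slots it currently has in use.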
+ Lower rank means higher priority of the worker to process the request.""" + global max_semaphores + if request_key == meta.processed_key: + # If the key is the same (worker is processing the same model) + return -2 # return the highest priority + if request_key is None or meta.processed_key is None: + # If not requesting a specific model or the worker is not processing + return -1 # return the second highest priority + return ( + max_semaphores - meta.semaphore.value + ) # return the number of slots in use + + +@asynccontextmanager +async def get_wix_with_semaphore( + request: Request, + request_key: Optional[str] = None, +) -> AsyncGenerator[int, None]: + """Get the worker index (wix) for the key and acquire the semaphore""" + global wix_metas + worker_ranks = [ + get_worker_rank(wix_meta, request_key) for wix_meta in wix_metas + ] + min_rank = min(worker_ranks) + candidates = [i for i, rank in enumerate(worker_ranks) if rank == min_rank] + if not candidates: + raise LookupError("No available wix") + wix_meta = wix_metas[choice(candidates)] + async with wix_meta.semaphore: + if await request.is_disconnected(): + raise get_cancelled_exc_class()() + wix_meta.processed_key = request_key + yield wix_meta.wix + + def validate_item_type(item: Any, type: Type[T]) -> T: """Validate that the item is of the correct type""" if isinstance(item, Exception): @@ -113,53 +154,6 @@ def validate_item_type(item: Any, type: Type[T]) -> T: return item -@asynccontextmanager -async def get_wix_with_semaphore( - key: Optional[str] = None, -) -> AsyncGenerator[int, None]: - """Get the worker index (wix) for the key and acquire the semaphore""" - if key is None: - # Find the first available slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.semaphore.value: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - else: - # Get the worker index (wix) for the key - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.key == key: - async with wix_metadata.semaphore: - yield wix - return - - # If the key is not in the wixs, find the first empty slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.key is None: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - # If there are no empty slot, find available slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.semaphore.value: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - # If there are no available slot, wait for one to become available - for wix, wix_metadata in enumerate(wixs): - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - raise LookupError("No available wix") - - def get_text_from_completion( completion: Union[Completion, ChatCompletion] ) -> str: @@ -228,7 +222,7 @@ def get_streaming_iterator( yield validate_item_type(gen, type=dict) -def log_request( +def log_request_and_response( body: Union[ CreateChatCompletionRequest, CreateCompletionRequest, @@ -321,7 +315,7 @@ def task_manager( logger.info( f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" ) - log_request(body=body, task_status=task_status) + log_request_and_response(body=body, task_status=task_status) async def create_chat_completion_or_completion( @@ -332,7 +326,7 @@ async def create_chat_completion_or_completion( If the body is a chat completion, then create a chat completion. If the body is a completion, then create a completion. 
If streaming is enabled, then return an EventSourceResponse.""" - async with get_wix_with_semaphore(body.model) as wix: + async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() producer: Callable[ [ @@ -408,12 +402,12 @@ async def create_completion(request: Request, body: CreateCompletionRequest): @router.post("/embeddings") async def create_embedding( - body: CreateEmbeddingRequest, + request: Request, body: CreateEmbeddingRequest ) -> Embedding: if not environ.get("LLAMA_API_EMBEDDINGS"): raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" - async with get_wix_with_semaphore(body.model) as wix: + async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() producer: Callable[ [CreateEmbeddingRequest, Queue, Event], @@ -441,8 +435,8 @@ async def create_embedding( @router.get("/models") -async def get_models() -> ModelList: - async with get_wix_with_semaphore() as wix: +async def get_models(request: Request) -> ModelList: + async with get_wix_with_semaphore(request) as wix: return ModelList( object="list", data=[ From 6b2e37fe67ac490296de1800a1f5d9592b10677a Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:14:44 +0900 Subject: [PATCH 11/15] bump dependencies --- poetry.lock | 121 ++++++++++++++++++++++++++++++++--------------- pyproject.toml | 5 +- requirements.txt | 15 +++--- 3 files changed, 94 insertions(+), 47 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6e0f091..a969ac9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -362,28 +362,28 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cmake" -version = "3.27.1" +version = "3.27.2" description = "CMake is an open-source, cross-platform family of tools designed to build, test and package software" optional = false python-versions = "*" files = [ - {file = "cmake-3.27.1-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:c62c5a6d42e68eb955fc321f7bc84290e4c4771ee7e5301c2eaa9586c874fd8e"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:18ef1c579cb4c94ece6bbb7c7f3e0170b078bf787f0a372194f0921e79f6098c"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:ad3aca0d94abe6313a7b1c65b8b3d7eb3158786fd1dd6a9f8c42f82850fb974c"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:50bfe69d369a61eb63e5b8af76a2383cf312d1e8449bd797d563f6c62809d317"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:7fb6d9183b90d4cc4db7b022aa7c9ef3431d281aea29ca259de7199bc75b7e09"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:2583464302ecc287619578627e26962386a41a98bbf1fb4c8c90d600ec1a1be5"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:7a5431c7ca0b7145b857dd0eab26f4f9ec42661bb67afa6d437b3e48532b8e3a"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1fb6d882bbd7e77fad206dfdbcaf880f4bcd7e8d0c23b37058ee155715bd19ed"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:cee7dd0bcc5bd14d94ecdbbf9883b17f3001adc5f696b7d8eba0482354e5e017"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_i686.whl", 
hash = "sha256:82a6f57449e7bf9b510ed82b29982e4eec8b5c5e80a51208368dc1aa58b8181b"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:7052bb12c3492083169269fee7c7a11c053cae35949346b12d2998b971602b78"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:482e7018fc8d9bc98e7f30b5071c021ca0e27b131dd61900395abfd768c3fe29"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:a112dd298b8ac598fef3653dff1592ba4c0f9bf7fe75b77ab44a6edfcceb96d4"}, - {file = "cmake-3.27.1-py2.py3-none-win32.whl", hash = "sha256:b9d68783ea01775d6d4ea220d3b4e90d5e287cf9a1db09c5a9b78c7748e1c3d0"}, - {file = "cmake-3.27.1-py2.py3-none-win_amd64.whl", hash = "sha256:628f75286475b89d6566db62c0869de5f0a07ad9bba10bebe6a48012fa1ee777"}, - {file = "cmake-3.27.1-py2.py3-none-win_arm64.whl", hash = "sha256:ee7a47e37a29b8124d9125a8c390fb94822a2695d80151560004d4f4f78c0ad7"}, - {file = "cmake-3.27.1.tar.gz", hash = "sha256:7ee6af09b2b575a491483b72927ee7e4beb59e7fb86e6d905a7027607a3f367e"}, + {file = "cmake-3.27.2-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:96ac856c4d6b2104408848f0005a8ab2229d4135b171ea9a03e8c33039ede420"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:11fe6129d07982721c5965fd804a4056b8c6e9c4f482ac9e0fe41bb3abc1ab5f"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:f0c64e89e2ea59592980c4fe3821d712fee0e74cf87c2aaec5b3ab9aa809a57c"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ca7650477dff2a1138776b28b79c0e99127be733d3978922e8f87b56a433eed6"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ab2e40fe09e76a7ef67da2bbbf7a4cd1f52db4f1c7b6ccdda2539f918830343a"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:980ee19f12c808cb8ddb56fdcee832501a9f9631799d8b4fc625c0a0b5fb4c55"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:115d30ca0760e3861d9ad6b3288cd11ee72a785b81227da0c1765d3b84e2c009"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efc338c939d6d435890a52458a260bf0942bd8392b648d7532a72c1ec0764e18"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:7f7438c60ccc01765b67abfb1797787c3b9459d500a804ed70a4cc181bc02204"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_i686.whl", hash = "sha256:294f008734267e0eee1574ad1b911bed137bc907ab19d60a618dab4615aa1fca"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:197a34dc62ee149ced343545fac67e5a30b93fda65250b065726f86ce92bdada"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:afb46ad883b174fb64347802ba5878423551dbd5847bb64669c39a5957c06eb7"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:83611ffd155e270a6b13bbf0cfd4e8688ebda634f448aa2e3734006c745bf33f"}, + {file = "cmake-3.27.2-py2.py3-none-win32.whl", hash = "sha256:53e12deb893da935e236f93accd47dbe2806620cd7654986234dc4487cc49652"}, + {file = "cmake-3.27.2-py2.py3-none-win_amd64.whl", hash = "sha256:611f9722c68c40352d38a6c01960ab038c3d0419e7aee3bf18f95b23031e0dfe"}, + {file = "cmake-3.27.2-py2.py3-none-win_arm64.whl", hash = 
"sha256:30620326b51ac2ce0d8f476747af6367a7ea21075c4d065fad9443904b07476a"}, + {file = "cmake-3.27.2.tar.gz", hash = "sha256:7cd6e2d7d5a1125f8c26c4f65214f8c942e3f276f98c16cb62ae382c35609f25"}, ] [package.extras] @@ -565,13 +565,13 @@ pgp = ["gpg"] [[package]] name = "exceptiongroup" -version = "1.1.2" +version = "1.1.3" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, - {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, ] [package.extras] @@ -579,17 +579,17 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.100.1" +version = "0.101.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.7" files = [ - {file = "fastapi-0.100.1-py3-none-any.whl", hash = "sha256:ec6dd52bfc4eff3063cfcd0713b43c87640fefb2687bbbe3d8a08d94049cdf32"}, - {file = "fastapi-0.100.1.tar.gz", hash = "sha256:522700d7a469e4a973d92321ab93312448fbe20fca9c8da97effc7e7bc56df23"}, + {file = "fastapi-0.101.1-py3-none-any.whl", hash = "sha256:aef5f8676eb1b8389952e1fe734abe20f04b71f6936afcc53b320ba79b686a4b"}, + {file = "fastapi-0.101.1.tar.gz", hash = "sha256:7b32000d14ca9992f7461117b81e4ef9ff0c07936af641b4fe40e67d5f9d63cb"}, ] [package.dependencies] -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<3.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" starlette = ">=0.27.0,<0.28.0" typing-extensions = ">=4.5.0" @@ -1855,13 +1855,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydantic-settings" -version = "2.0.2" +version = "2.0.3" description = "Settings management using Pydantic" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic_settings-2.0.2-py3-none-any.whl", hash = "sha256:6183a2abeab465d5a3ab69758e9a22d38b0cc2ba193f0b85f6971a252ea630f6"}, - {file = "pydantic_settings-2.0.2.tar.gz", hash = "sha256:342337fff50b23585e807a86dec85037900972364435c55c2fc00d16ff080539"}, + {file = "pydantic_settings-2.0.3-py3-none-any.whl", hash = "sha256:ddd907b066622bd67603b75e2ff791875540dc485b7307c4fffc015719da8625"}, + {file = "pydantic_settings-2.0.3.tar.gz", hash = "sha256:962dc3672495aad6ae96a4390fac7e593591e144625e5112d359f8f67fb75945"}, ] [package.dependencies] @@ -2640,13 +2640,13 @@ files = [ [[package]] name = "sse-starlette" -version = "1.6.1" +version = "1.6.5" description = "\"SSE plugin for Starlette\"" optional = false python-versions = ">=3.8" files = [ - {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, - {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, + {file = "sse-starlette-1.6.5.tar.gz", hash = "sha256:819f2c421fb37067380fe3dcaba246c476b02651b7bb7601099a378ad802a0ac"}, + {file = "sse_starlette-1.6.5-py3-none-any.whl", hash = "sha256:68b6b7eb49be0c72a2af80a055994c13afcaa4761b29226beb208f954c25a642"}, ] 
[package.dependencies] @@ -2684,6 +2684,51 @@ files = [ numpy = ">=1.12.0" protobuf = ">=3.19.6" +[[package]] +name = "tiktoken" +version = "0.4.0" +description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tiktoken-0.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:176cad7f053d2cc82ce7e2a7c883ccc6971840a4b5276740d0b732a2b2011f8a"}, + {file = "tiktoken-0.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:450d504892b3ac80207700266ee87c932df8efea54e05cefe8613edc963c1285"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d662de1e7986d129139faf15e6a6ee7665ee103440769b8dedf3e7ba6ac37f"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5727d852ead18b7927b8adf558a6f913a15c7766725b23dbe21d22e243041b28"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c06cd92b09eb0404cedce3702fa866bf0d00e399439dad3f10288ddc31045422"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9ec161e40ed44e4210d3b31e2ff426b4a55e8254f1023e5d2595cb60044f8ea6"}, + {file = "tiktoken-0.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:1e8fa13cf9889d2c928b9e258e9dbbbf88ab02016e4236aae76e3b4f82dd8288"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb2341836b725c60d0ab3c84970b9b5f68d4b733a7bcb80fb25967e5addb9920"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ca30367ad750ee7d42fe80079d3092bd35bb266be7882b79c3bd159b39a17b0"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3dc3df19ddec79435bb2a94ee46f4b9560d0299c23520803d851008445671197"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d980fa066e962ef0f4dad0222e63a484c0c993c7a47c7dafda844ca5aded1f3"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:329f548a821a2f339adc9fbcfd9fc12602e4b3f8598df5593cfc09839e9ae5e4"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b1a038cee487931a5caaef0a2e8520e645508cde21717eacc9af3fbda097d8bb"}, + {file = "tiktoken-0.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:08efa59468dbe23ed038c28893e2a7158d8c211c3dd07f2bbc9a30e012512f1d"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3020350685e009053829c1168703c346fb32c70c57d828ca3742558e94827a9"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba16698c42aad8190e746cd82f6a06769ac7edd415d62ba027ea1d99d958ed93"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c15d9955cc18d0d7ffcc9c03dc51167aedae98542238b54a2e659bd25fe77ed"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64e1091c7103100d5e2c6ea706f0ec9cd6dc313e6fe7775ef777f40d8c20811e"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e87751b54eb7bca580126353a9cf17a8a8eaadd44edaac0e01123e1513a33281"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e063b988b8ba8b66d6cc2026d937557437e79258095f52eaecfafb18a0a10c03"}, + {file = "tiktoken-0.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:9c6dd439e878172dc163fced3bc7b19b9ab549c271b257599f55afc3a6a5edef"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:8d1d97f83697ff44466c6bef5d35b6bcdb51e0125829a9c0ed1e6e39fb9a08fb"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b6bce7c68aa765f666474c7c11a7aebda3816b58ecafb209afa59c799b0dd2d"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a73286c35899ca51d8d764bc0b4d60838627ce193acb60cc88aea60bddec4fd"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0394967d2236a60fd0aacef26646b53636423cc9c70c32f7c5124ebe86f3093"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:dae2af6f03ecba5f679449fa66ed96585b2fa6accb7fd57d9649e9e398a94f44"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55e251b1da3c293432179cf7c452cfa35562da286786be5a8b1ee3405c2b0dd2"}, + {file = "tiktoken-0.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:c835d0ee1f84a5aa04921717754eadbc0f0a56cf613f78dfc1cf9ad35f6c3fea"}, + {file = "tiktoken-0.4.0.tar.gz", hash = "sha256:59b20a819969735b48161ced9b92f05dc4519c17be4015cfb73b65270a243620"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + [[package]] name = "tokenizers" version = "0.13.3" @@ -2762,13 +2807,13 @@ files = [ [[package]] name = "tqdm" -version = "4.66.0" +version = "4.66.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.0-py3-none-any.whl", hash = "sha256:39d459c7140b7890174e69d4d68d6291bc774a55b4bc5d93c0b760798ac5a03e"}, - {file = "tqdm-4.66.0.tar.gz", hash = "sha256:cc6e7e52202d894e66632c5c8a9330bd0e3ff35d2965c93ca832114a3d865362"}, + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, ] [package.dependencies] @@ -2981,13 +3026,13 @@ test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "my [[package]] name = "virtualenv" -version = "20.24.2" +version = "20.24.3" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.24.2-py3-none-any.whl", hash = "sha256:43a3052be36080548bdee0b42919c88072037d50d56c28bd3f853cbe92b953ff"}, - {file = "virtualenv-20.24.2.tar.gz", hash = "sha256:fd8a78f46f6b99a67b7ec5cf73f92357891a7b3a40fd97637c27f854aae3b9e0"}, + {file = "virtualenv-20.24.3-py3-none-any.whl", hash = "sha256:95a6e9398b4967fbcb5fef2acec5efaf9aa4972049d9ae41f95e0972a683fd02"}, + {file = "virtualenv-20.24.3.tar.gz", hash = "sha256:e5c3b4ce817b0b328af041506a2a299418c98747c4b1e68cb7527e74ced23efc"}, ] [package.dependencies] @@ -3264,4 +3309,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12" -content-hash = "09b071c03e16e84be7d44b7cfc1670a2035ac8f96c5894cfd11726355a9fe3b4" +content-hash = "7bd21a07f403c13e49b67e13595f46dbac47bfdb4d1d0fc5fd2f40f08e62f886" diff --git a/pyproject.toml b/pyproject.toml index 2d80eb7..ef68833 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,14 +15,14 @@ python = ">=3.8.1,<3.12" poetry = "^1.5.1" uvicorn = { extras = ["standard"], version = "^0.23" } -fastapi = "^0.100.1" +fastapi = ">=0.100.1" orjson = "^3.9" sse-starlette = "^1.6" psutil = "^5.9" cmake = ">=3.18.0" filelock = "^3.12" transformers = "^4.31.0" -tensorflow-hub = 
"^0.14" +tensorflow-hub = ">=0.14" numpy = "^1.24.3" safetensors = "^0.3.1" ninja = "^1.11.1" @@ -31,6 +31,7 @@ pydantic = "^2.0.0" pydantic-settings = "^2.0.0" sentencepiece = ">=0.1.97" typing-extensions = ">=4.0.0" +tiktoken = ">=0.4.0" # torch: 2.0.1+cu118 for GPU, 2.0.1+cpu for CPU [tool.poetry.group.dev.dependencies] diff --git a/requirements.txt b/requirements.txt index ac45196..7801854 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,15 +8,15 @@ cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (s charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" -cmake==3.27.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +cmake==3.27.2 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" cryptography==41.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" diskcache==5.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" distlib==0.3.7 ; python_full_version >= "3.8.1" and python_version < "3.12" dulwich==0.21.5 ; python_full_version >= "3.8.1" and python_version < "3.12" -exceptiongroup==1.1.2 ; python_full_version >= "3.8.1" and python_version < "3.11" -fastapi==0.100.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +exceptiongroup==1.1.3 ; python_full_version >= "3.8.1" and python_version < "3.11" +fastapi==0.101.1 ; python_full_version >= "3.8.1" and python_version < "3.12" filelock==3.12.2 ; python_full_version >= "3.8.1" and python_version < "3.12" fsspec==2023.6.0 ; python_full_version >= "3.8.1" and python_version < "3.12" h11==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -51,7 +51,7 @@ psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") pydantic-core==2.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -pydantic-settings==2.0.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic-settings==2.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" pydantic==2.1.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pyproject-hooks==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -69,20 +69,21 @@ sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.1 shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" six==1.16.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sniffio==1.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -sse-starlette==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +sse-starlette==1.6.5 ; python_full_version >= "3.8.1" and python_version < "3.12" starlette==0.27.0 ; python_full_version >= "3.8.1" and python_version < "3.12" tensorflow-hub==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tiktoken==0.4.0 ; python_full_version >= 
"3.8.1" and python_version < "3.12" tokenizers==0.13.3 ; python_full_version >= "3.8.1" and python_version < "3.12" tomli==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.11" tomlkit==0.12.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -tqdm==4.66.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tqdm==4.66.1 ; python_full_version >= "3.8.1" and python_version < "3.12" transformers==4.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" trove-classifiers==2023.8.7 ; python_full_version >= "3.8.1" and python_version < "3.12" typing-extensions==4.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" urllib3==1.26.16 ; python_full_version >= "3.8.1" and python_version < "3.12" uvicorn[standard]==0.23.2 ; python_full_version >= "3.8.1" and python_version < "3.12" uvloop==0.17.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_full_version >= "3.8.1" and python_version < "3.12" -virtualenv==20.24.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +virtualenv==20.24.3 ; python_full_version >= "3.8.1" and python_version < "3.12" watchfiles==0.19.0 ; python_full_version >= "3.8.1" and python_version < "3.12" webencodings==0.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" websockets==11.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" From 086658326e68f4dcec425f659535449c01639e3b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:15:38 +0900 Subject: [PATCH 12/15] Implemented OpenAI compatible logit bias --- llama_api/logits/bias.py | 64 ++++++++++++++++++++++++--------- llama_api/mixins/logits.py | 5 +-- llama_api/modules/exllama.py | 14 +++++--- llama_api/schemas/api.py | 12 +++---- llama_api/server/pools/llama.py | 1 + 5 files changed, 64 insertions(+), 32 deletions(-) diff --git a/llama_api/logits/bias.py b/llama_api/logits/bias.py index ebdae0f..5c716e4 100644 --- a/llama_api/logits/bias.py +++ b/llama_api/logits/bias.py @@ -1,10 +1,30 @@ -from typing import TYPE_CHECKING, Callable, Dict, List, Literal, Optional +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, +) +from ..utils.logger import ApiLogger from .base import BaseLogitProcessor if TYPE_CHECKING: import torch as pytorch +logger = ApiLogger(__name__) + +try: + import tiktoken + + openai_decoder = tiktoken.get_encoding("cl100k_base").decode +except Exception as e: + logger.warning( + "Could not load tiktoken, which is required for OpenAI GPT models. 
" + f"Please `pip install tiktoken` to use the OpenAI encoder: {e}" + ) + openai_decoder: Optional[Callable[[List[int]], str]] = None + class LogitBiasProcessor(BaseLogitProcessor): """Create a logit bias processor to bias the logit scores.""" @@ -12,23 +32,33 @@ class LogitBiasProcessor(BaseLogitProcessor): def __init__( self, logit_bias: Dict[str, float], - logit_bias_type: Optional[Literal["input_ids", "tokens"]], encoder: Callable[[str], List[int]], + is_openai: bool = False, ): - if logit_bias_type is None: - logit_bias_type = "input_ids" + """Create a logit bias processor to bias the logit scores.""" + + global openai_decoder - to_bias = {} # type: Dict[int, float] - if logit_bias_type == "input_ids": - for input_id_string, score in logit_bias.items(): - to_bias[int(input_id_string)] = score + biases = {} # type: Dict[int, float] + for id_or_token, bias in logit_bias.items(): + is_digit = id_or_token.isdigit() - elif logit_bias_type == "tokens": - for token, score in logit_bias.items(): - for input_id in encoder(token): - to_bias[input_id] = score + if is_digit and is_openai and openai_decoder is not None: + # If we have an OpenAI id, we need to convert it to a token + # and then encode the token to get the ids + for id in encoder(openai_decoder([int(id_or_token)])): + if abs(bias) > abs(biases.get(id, 0.0)): + biases[id] = bias + elif is_digit: + # If we have a digit, we can just use it directly + biases[int(id_or_token)] = bias + else: + # Otherwise, we need to encode the token and use the ids + for id in encoder(id_or_token): + if abs(bias) > abs(biases.get(id, 0.0)): + biases[id] = bias - self._to_bias = to_bias + self._biases = biases self._bias_tensor = None def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": @@ -38,8 +68,8 @@ def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": self._bias_tensor = torch.zeros( scores.shape[-1], dtype=scores.dtype, device=scores.device ) - for idx, value in self._to_bias.items(): - self._bias_tensor[idx] = value + for id, bias in self._biases.items(): + self._bias_tensor[id] = bias return self._bias_tensor @@ -51,6 +81,6 @@ def with_torch( def without_torch( self, input_ids: List[int], scores: List[float] ) -> List[float]: - for id, biased_score in self._to_bias.items(): - scores[id] += biased_score + for id, bias in self._biases.items(): + scores[id] += bias return scores diff --git a/llama_api/mixins/logits.py b/llama_api/mixins/logits.py index 75867a1..b90bfce 100644 --- a/llama_api/mixins/logits.py +++ b/llama_api/mixins/logits.py @@ -9,7 +9,8 @@ class LogitsMixin: @staticmethod def get_logit_processors( - settings: TextGenerationSettings, encoder: Callable[[str], List[int]] + settings: TextGenerationSettings, + encoder: Callable[[str], List[int]], ) -> List[BaseLogitProcessor]: logit_processors: List[BaseLogitProcessor] = [] if settings.muse: @@ -27,8 +28,8 @@ def get_logit_processors( 0, LogitBiasProcessor( logit_bias=settings.logit_bias, - logit_bias_type=settings.logit_bias_type, encoder=encoder, + is_openai=settings.is_openai, ), ) return logit_processors diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 7d9e960..df8564f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -322,8 +322,13 @@ def _generate_text_with_streaming( generator = _apply_settings_to_generator(cg, settings=settings) # Start the generator + context_window = cg.llm_model.max_total_tokens if settings.guidance_scale == 1: ids = _encode(cg.tokenizer, prompt) + 
prompt_tokens = ids.shape[-1] + cg.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) mask = None # type: Optional[Tensor] generator.end_beam_search() generator.gen_begin_reuse(ids) @@ -333,13 +338,12 @@ def _generate_text_with_streaming( [prompt, settings.negative_prompt or ""], return_mask=True, ) + prompt_tokens = ids.shape[-1] + cg.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) generator.gen_begin(ids, mask=mask) - prompt_tokens = ids.shape[-1] - context_window = cg.llm_model.max_total_tokens - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) settings.max_tokens = min( settings.max_tokens, context_window - prompt_tokens ) diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 543c76c..e052324 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -189,14 +189,6 @@ class TextGenerationSettings(BaseModel): "logits of the model to influence." ), ) - logit_bias_type: Literal["input_ids", "tokens"] = Field( - default="tokens", - description=( - "The type of logit bias to use. If 'input_ids', the bias is applied to the input" - " ids(integer). If 'tokens', the bias is applied to the tokens(string). If None, the bias is not " - "applied." - ), - ) ban_eos_token: bool = Field( default=False, description="If True, the EOS token is banned from being generated.", @@ -219,6 +211,10 @@ class TextGenerationSettings(BaseModel): "The negative prompt is used to encourage the model not to generate samples that are too similar to the " "negative prompt. CFG is enabled by setting `guidance_scale > 1`.", ) + is_openai: bool = Field( + default=False, + description="If True, the model is regarded as an OpenAI model.", + ) class CreateEmbeddingRequest(BaseModel): diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index eca2af2..5d1751e 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -94,6 +94,7 @@ def get_completion_generator( ) if body.model in openai_replacement_models: body.model = openai_replacement_models[body.model] + body.is_openai = True # Check if the model is defined in LLMModels enum llm_model = get_model(body.model) From fbb5d0a04bcf6f7ef37f342db57dd05178183bb2 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Wed, 16 Aug 2023 01:21:51 +0900 Subject: [PATCH 13/15] Better error logger --- build_shared_lib.py | 15 ++++++ llama_api/server/pools/llama.py | 25 ++++----- llama_api/server/routers/v1.py | 8 +-- llama_api/utils/huggingface_downloader.py | 4 +- llama_api/utils/llama_cpp.py | 14 +---- llama_api/utils/logger.py | 64 ++++++++++++++++++++--- llama_api/utils/path.py | 9 +--- 7 files changed, 90 insertions(+), 49 deletions(-) create mode 100644 build_shared_lib.py diff --git a/build_shared_lib.py b/build_shared_lib.py new file mode 100644 index 0000000..819f819 --- /dev/null +++ b/build_shared_lib.py @@ -0,0 +1,15 @@ +# flake8: noqa + +from llama_api.utils.llama_cpp import ( + build_shared_lib, + CPU_ARGS, # Only use CPU + METAL_ARGS, # Only use Metal (MacOS) + CUBLAS_ARGS, # Only use CUBLAS (Nvidia) +) +from os import environ + + +if __name__ == "__main__": + environ["FORCE_CMAKE"] = "1" + environ["CMAKE_ARGS"] = CPU_ARGS # EDIT THIS LINE TO CHANGE BUILD TYPE !!! 
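+    # The imports above also expose METAL_ARGS (macOS) and CUBLAS_ARGS
+    # (NVIDIA); either may be assigned to CMAKE_ARGS here in place of
+    # CPU_ARGS to target a GPU build.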
+ build_shared_lib() diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 5d1751e..c3f5756 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -70,10 +70,10 @@ def get_model_names() -> List[str]: def get_model(model_name: str) -> "BaseLLMModel": """Get a model from the model_definitions.py file""" - try: + with logger.log_any_error( + f"Error getting model: {model_name}", exc_info=None + ): return getattr(model_definitions, model_name) - except Exception: - raise AssertionError(f"Could not find a model: {model_name}") def get_completion_generator( @@ -87,7 +87,9 @@ def get_completion_generator( If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" - try: + with logger.log_any_error( + f"Error getting a completion generator of {body.model}" + ): # Check if the model is an OpenAI model openai_replacement_models: Dict[str, str] = getattr( model_definitions, "openai_replacement_models", {} @@ -140,11 +142,6 @@ def get_completion_generator( # Add the new completion generator to the deque cache completion_generators.append(to_return) return to_return - except (AssertionError, OSError, MemoryError) as e: - raise e - except Exception as e: - logger.exception(f"Exception in get_completion_generator: {e}") - raise AssertionError(f"Could not find a model: {body.model}") def get_embedding_generator( @@ -153,7 +150,10 @@ def get_embedding_generator( """Get an embedding generator for the given model. If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" - try: + + with logger.log_any_error( + f"Error getting a embedding generator of {body.model}" + ): body.model = body.model.lower() for embedding_generator in embedding_generators: if embedding_generator.model_name == body.model: @@ -190,11 +190,6 @@ def get_embedding_generator( # Add the new completion generator to the deque cache embedding_generators.append(to_return) return to_return - except (AssertionError, OSError, MemoryError) as e: - raise e - except Exception as e: - logger.exception(f"Exception in get_embedding_generator: {e}") - raise AssertionError(f"Could not find a model: {body.model}") def generate_completion_chunks( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 9a5ec15..ba0c49d 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -2,7 +2,7 @@ Use same format as OpenAI API""" -from asyncio import Task, create_task +from asyncio import CancelledError, Task, create_task from contextlib import asynccontextmanager, contextmanager from dataclasses import dataclass, field from functools import partial @@ -138,7 +138,7 @@ async def get_wix_with_semaphore( wix_meta = wix_metas[choice(candidates)] async with wix_meta.semaphore: if await request.is_disconnected(): - raise get_cancelled_exc_class()() + raise CancelledError("Request is disconnected") wix_meta.processed_key = request_key yield wix_meta.wix @@ -200,10 +200,10 @@ async def get_event_publisher( if await request.is_disconnected(): raise get_cancelled_exc_class()() await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class() as e: + except get_cancelled_exc_class(): with move_on_after(1, shield=True): task_status["interrupted"] = True - raise e + raise def get_streaming_iterator( diff --git a/llama_api/utils/huggingface_downloader.py b/llama_api/utils/huggingface_downloader.py index eedbf01..57fea87 100644 --- 
a/llama_api/utils/huggingface_downloader.py +++ b/llama_api/utils/huggingface_downloader.py @@ -84,10 +84,10 @@ def __init__( ) except ValueError as err_branch: logger.error(err_branch) - raise err_branch + raise except HTTPError as err_http: logger.error(err_http) - raise err_http + raise @property def model(self) -> str: diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index fc16ef2..b1de480 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -1,9 +1,8 @@ import shutil import subprocess import sys -from contextlib import contextmanager from logging import Logger, getLogger -from os import chdir, environ, getcwd +from os import environ from pathlib import Path from typing import List, Optional, Union @@ -54,17 +53,6 @@ } -@contextmanager -def _temporary_change_cwd(path): - # Change the current working directory to `path` and then change it back - prev_cwd = getcwd() - chdir(path) - try: - yield - finally: - chdir(prev_cwd) - - def _git_clone_if_not_exists() -> None: # Clone the git repos if they don't exist for clone_path, clone_command in GIT_CLONES.items(): diff --git a/llama_api/utils/logger.py b/llama_api/utils/logger.py index dbefbbb..093f005 100644 --- a/llama_api/utils/logger.py +++ b/llama_api/utils/logger.py @@ -1,9 +1,10 @@ """Logger module for the API""" - +# flake8: noqa +from contextlib import contextmanager import logging from dataclasses import dataclass from pathlib import Path -from typing import Dict, Optional +from typing import Callable, Dict, Generator, Optional, Union from .colorama import Fore, Style @@ -82,7 +83,7 @@ def __init__( self.addHandler(console) @classmethod - def cinfo(cls, msg: str, *args, **kwargs) -> None: + def cinfo(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super( @@ -91,7 +92,7 @@ def cinfo(cls, msg: str, *args, **kwargs) -> None: ).info(msg, *args, **kwargs) @classmethod - def cdebug(cls, msg: str, *args, **kwargs) -> None: + def cdebug(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).debug( @@ -99,7 +100,7 @@ def cdebug(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cwarning(cls, msg: str, *args, **kwargs) -> None: + def cwarning(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).warning( @@ -107,7 +108,7 @@ def cwarning(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cerror(cls, msg: str, *args, **kwargs) -> None: + def cerror(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).error( @@ -115,7 +116,7 @@ def cerror(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cexception(cls, msg: str, *args, **kwargs) -> None: + def cexception(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).exception( @@ -123,9 +124,56 @@ def cexception(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def ccritical(cls, msg: str, *args, **kwargs) -> None: + def ccritical(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).critical( msg, *args, **kwargs ) + + @contextmanager + def log_any_error( + self, + msg: Optional[object] = None, + 
level: int = logging.ERROR, + exc_info: Optional[Union[bool, Exception]] = True, + suppress_exception: bool = False, + on_error: Optional[Callable[[Exception], None]] = None, + *args, + **kwargs, + ) -> Generator[None, None, None]: + """ + A context manager to automatically log exceptions that occur within its context. + + Args: + msg (Optional[object], default=None): An optional message to be prepended to the exception message in the log. + level (int, default=logging.ERROR): The logging level at which the exception should be logged. Default is ERROR. + exc_info (logging._ExcInfoType, default=True): If set to True, exception information will be added to the log. Otherwise, only the exception message will be logged. + suppress_exception (bool, default=False): If True, the exception will be suppressed (not re-raised). If False, the exception will be re-raised after logging. + on_error (Optional[Callable[[Exception], None]], default=None): A callback function that will be invoked with the exception as its argument if one occurs. + *args: Variable length argument list passed to the logging function. + **kwargs: Arbitrary keyword arguments passed to the logging function. + + Usage: + with logger.log_any_error(msg="An error occurred", level=logging.WARNING, on_error=my_callback_function): + potentially_faulty_function() + + Notes: + - If a custom message is provided using the 'msg' parameter, it will be prepended to the actual exception message in the log. + - If 'on_error' is provided, it will be executed with the caught exception as its argument. This can be used for custom handling or notification mechanisms. + """ + + try: + yield + except Exception as e: + self.log( + level, + f"{msg}: {e}" if msg else e, + *args, + **kwargs, + exc_info=exc_info, + ) + if on_error: + on_error(e) + if not suppress_exception: + raise diff --git a/llama_api/utils/path.py b/llama_api/utils/path.py index 84f6f9d..27a51cd 100644 --- a/llama_api/utils/path.py +++ b/llama_api/utils/path.py @@ -154,7 +154,7 @@ def resolve_model_path_to_posix( model_path: str, default_relative_directory: Optional[str] = None ) -> str: """Resolve a model path to a POSIX path.""" - try: + with logger.log_any_error("Error resolving model path"): path = Path(model_path) if path.is_absolute(): # The path is already absolute @@ -191,9 +191,6 @@ def resolve_model_path_to_posix( ) # Try to resolve the model path from Huggingface return HuggingfaceResolver(model_path).resolve() - except Exception as e: - logger.error(f"Error resolving model path: {e}") - raise e def resolve_model_path_to_posix_with_cache( @@ -230,11 +227,9 @@ def resolve_model_path_to_posix_with_cache( cache[model_path] = resolved # Update the cache file - try: + with logger.log_any_error("Error writing model path cache"): with open(cache_file, "w") as f: f.write(orjson.dumps(cache).decode()) - except Exception as e: - logger.error(f"Error writing model path cache: {e}") return resolved except (Timeout, TypeError) as e: logger.warning( From 7da7b60ac33fdf153f1df5153da4f371bd0b4df6 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Wed, 16 Aug 2023 01:22:02 +0900 Subject: [PATCH 14/15] lora support for exllama --- llama_api/modules/exllama.py | 42 ++++++-- llama_api/modules/exllama_lora.py | 169 ++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 11 deletions(-) create mode 100644 llama_api/modules/exllama_lora.py diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index df8564f..bd868c3 100644 --- a/llama_api/modules/exllama.py +++ 
b/llama_api/modules/exllama.py @@ -7,14 +7,13 @@ logger = ApiLogger(__name__) if environ.get("LLAMA_API_XFORMERS") == "1": - try: + with logger.log_any_error( + "xformers mode is enabled, but xformers is not installed", + suppress_exception=True, + ): from ..modules.xformers import hijack_attention_forward hijack_attention_forward() - except Exception as e: - logger.warning( - f"xformers mode is enabled, but xformers is not installed: {e}" - ) from pathlib import Path from typing import ( TYPE_CHECKING, @@ -43,6 +42,7 @@ from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator +from .exllama_lora import ExLlamaLora with import_repository( git_path="https://github.com/turboderp/exllama", @@ -309,7 +309,7 @@ def _generate_text_with_streaming( prompt: str, settings: "TextGenerationSettings", ) -> Iterator[str]: - try: + with logger.log_any_error(): # Make sure that the stop token is a list if isinstance(settings.stop, str): stops = [settings.stop] # type: List[str] @@ -321,6 +321,10 @@ def _generate_text_with_streaming( # Apply the settings to the generator generator = _apply_settings_to_generator(cg, settings=settings) + # Apply the LORA model + if cg.lora: + generator.lora = cg.lora # type: ignore + # Start the generator context_window = cg.llm_model.max_total_tokens if settings.guidance_scale == 1: @@ -351,9 +355,6 @@ def _generate_text_with_streaming( yield from _generator( cg, settings=settings, cfg_mask=mask, stops=stops ) - except Exception as e: - logger.exception(e) - raise e class ExllamaCompletionGenerator(BaseCompletionGenerator): @@ -363,6 +364,7 @@ class ExllamaCompletionGenerator(BaseCompletionGenerator): _tokenizer: Optional[ExLlamaTokenizer] = None _generator: Optional[ExLlamaGenerator] = None _llm_model: Optional["ExllamaModel"] = None + _lora: Optional["ExLlamaLora"] = None _completion_status: Dict[ str, int ] = {} # key: completion_id, value: number of completion tokens @@ -397,22 +399,40 @@ def config(self) -> ExLlamaConfig: assert self._config is not None, "Config is not initialized." 
return self._config + @property + def lora(self) -> Optional[ExLlamaLora]: + return self._lora + @classmethod def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": - result = cls() model_folder_path = Path(llm_model.model_path_resolved) + lora_path = model_folder_path / "adapter_model.bin" + lora_config_path = model_folder_path / "adapter_config.json" + + result = cls() + result._llm_model = llm_model result._config = _make_config(model_folder_path, llm_model) result._tokenizer = ExLlamaTokenizer( (model_folder_path / "tokenizer.model").as_posix() ) result._model = ExLlama(result._config) + if lora_path.exists() and lora_config_path.exists(): + logger.info(f"🦙 LORA model found for {result.model_name}") + with logger.log_any_error( + f"🦙 LORA model loading failed for {result.model_name}" + ): + result._lora = ExLlamaLora( + model=result._model, + lora_config_path=lora_config_path.as_posix(), + lora_path=lora_path.as_posix(), + ) + logger.info(f"🦙 LORA model loaded for {result.model_name}") result._cache = ExLlamaCache(result._model) result._generator = ExLlamaGenerator( result._model, result._tokenizer, result._cache ) - result._llm_model = llm_model return result def generate_completion_with_streaming( diff --git a/llama_api/modules/exllama_lora.py b/llama_api/modules/exllama_lora.py new file mode 100644 index 0000000..7f2c3c9 --- /dev/null +++ b/llama_api/modules/exllama_lora.py @@ -0,0 +1,169 @@ +# flake8: noqa +from pathlib import Path +from typing import Dict, Union +from llama_api.utils.dependency import import_repository + +with import_repository( + git_path="https://github.com/turboderp/exllama", + disk_path="repositories/exllama", +): + from repositories.exllama.model import ExLlama, Ex4bitLinear, ExLlamaConfig + +import json + +import torch +from safetensors.torch import load_file as safe_load_file +from torch import load as load_file + + +class ExLlamaLora: + lora_config_path: str + lora_path: str + lora_r: int + lora_alpha: float + lora_scaling: float + config: ExLlamaConfig + tensors: Dict[str, torch.Tensor] + bias_ignored: bool + + def __init__( + self, + model: ExLlama, + lora_config_path: Union[str, Path], + lora_path: Union[str, Path], + ): + self.lora_config_path = str(lora_config_path) + self.lora_path = str(lora_path) + self.model = model + self.config = model.config + self.tensors = {} + self.bias_ignored = False + + # Grab relevant items from LoRA config + with open(lora_config_path) as f: + read_config = json.load(f) + + self.lora_r = read_config["r"] + self.lora_alpha = float(read_config["lora_alpha"]) + self.lora_scaling = self.lora_alpha / self.lora_r + + if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]: + raise ValueError(" ## Error: fan_in_fan_out mode not supported.") + + # Load LoRA weights + if self.lora_path.endswith(".safetensors"): + f = safe_load_file(self.lora_path, device="cpu") + else: + f = load_file(self.lora_path, map_location="cpu") + + for key in f.keys(): + tensor = f[key] + + # Find target module + i = key.find("model.layers.") + if i == -1: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + target_key = key[i:] + ks = target_key.split(".") + decoder_idx = int(ks[2]) + decoder_part = ks[3] + decoder_layer = ks[4] + lora_half = ks[5] + + if lora_half == "bias": + epsilon = 1e-6 + if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: + raise ValueError( + f" ## Error: unsupported bias target {self.lora_path}: {key}" + ) + self.bias_ignored = True + 
continue + + target_module = self.model.layers[decoder_idx] + if decoder_part == "self_attn": + target_module = target_module.self_attn + elif decoder_part == "mlp": + target_module = target_module.mlp + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + if decoder_layer == "q_proj": + target_module = target_module.q_proj + elif decoder_layer == "k_proj": + target_module = target_module.k_proj + elif decoder_layer == "v_proj": + target_module = target_module.v_proj + elif decoder_layer == "o_proj": + target_module = target_module.o_proj + elif decoder_layer == "gate_proj": + target_module = target_module.gate_proj + elif decoder_layer == "up_proj": + target_module = target_module.up_proj + elif decoder_layer == "down_proj": + target_module = target_module.down_proj + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + # Check that shape is compatible + assert isinstance( + target_module, Ex4bitLinear + ), f"Target module {target_module} is not Ex4bitLinear, but {type(target_module)}" + + if lora_half == "lora_A": + in_features = tensor.shape[1] + out_features = None + elif lora_half == "lora_B": + in_features = None + out_features = tensor.shape[0] + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + if (in_features and in_features != target_module.in_features) or ( + out_features and out_features != target_module.out_features + ): + raise ValueError( + f" ## Error: incompatible tensor shape in {self.lora_path}: {key}" + ) + + # For efficiency, transpose adapter instead of transposing state during inference + + tensor = tensor.T.contiguous() + + # Pre-scale + + if lora_half == "lora_B" and self.lora_scaling != 1.0: + tensor.mul_(self.lora_scaling) + + # Check that dtype is compatible, or convert + + if tensor.dtype == torch.bfloat16: + tensor = tensor.to(torch.float16) + + elif tensor.dtype == torch.float32: + tensor = tensor.to(torch.float16) + + elif tensor.dtype == torch.float16: + pass + + else: + raise ValueError( + f" ## Error: unsupported tensor dtype in {self.lora_path}" + ) + + # Move to target device + + device = self.config.device_map.map(target_key) + tensor = tensor.to(device, non_blocking=True) + + # Store adapter tensor + + self.tensors[target_key] = tensor From 1f111ba1d938d89b97045904e1c373be90e1cf28 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 00:04:50 +0900 Subject: [PATCH 15/15] update: docker image & readme --- .gitignore | 3 +- build_shared_lib.py | 19 ++++++++- docker-compose.persistent.yml | 2 +- docker-compose.yml | 2 +- llama_api/server/routers/v1.py | 4 +- main.py | 4 +- readme.md | 73 +++++++++++++++++++--------------- requirements-all.txt | 26 ------------ 8 files changed, 67 insertions(+), 66 deletions(-) delete mode 100644 requirements-all.txt diff --git a/.gitignore b/.gitignore index 4836eb0..a038e86 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ repositories/ .vscode/ .test-venv/ .temp/ -PRIVATE_* \ No newline at end of file +PRIVATE_* +private/* \ No newline at end of file diff --git a/build_shared_lib.py b/build_shared_lib.py index 819f819..594403c 100644 --- a/build_shared_lib.py +++ b/build_shared_lib.py @@ -1,5 +1,6 @@ # flake8: noqa +from argparse import ArgumentParser from llama_api.utils.llama_cpp import ( build_shared_lib, CPU_ARGS, # Only use CPU @@ -8,8 +9,24 @@ ) from os import environ +ARGS = { + "CPU": CPU_ARGS, + "METAL": METAL_ARGS, + "CUBLAS": CUBLAS_ARGS, + "CUDA": CUBLAS_ARGS, +} if 
__name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-b",
+        "--build_type",
+        type=lambda s: str(s).upper(),
+        default="CPU",
+        choices=["CPU", "METAL", "CUBLAS", "CUDA"],
+        help="Build type",
+    )
+
     environ["FORCE_CMAKE"] = "1"
-    environ["CMAKE_ARGS"] = CPU_ARGS  # EDIT THIS LINE TO CHANGE BUILD TYPE !!!
+    environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
 
     build_shared_lib()
diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml
index 08605d3..f018d07 100644
--- a/docker-compose.persistent.yml
+++ b/docker-compose.persistent.yml
@@ -5,7 +5,7 @@ volumes:
 
 services:
   llama-api:
-    image: cosogi/llama-api:230814
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
       - FORCE_CUDA=1
diff --git a/docker-compose.yml b/docker-compose.yml
index a914dfa..3c910ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3'
 
 services:
   llama-api:
-    image: cosogi/llama-api:230814
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
       - FORCE_CUDA=1
diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py
index ba0c49d..b2aeb47 100644
--- a/llama_api/server/routers/v1.py
+++ b/llama_api/server/routers/v1.py
@@ -2,7 +2,7 @@
 Use same format as OpenAI API"""
 
 
-from asyncio import CancelledError, Task, create_task
+from asyncio import Task, create_task
 from contextlib import asynccontextmanager, contextmanager
 from dataclasses import dataclass, field
 from functools import partial
@@ -138,7 +138,7 @@ async def get_wix_with_semaphore(
         wix_meta = wix_metas[choice(candidates)]
         async with wix_meta.semaphore:
             if await request.is_disconnected():
-                raise CancelledError("Request is disconnected")
+                return
             wix_meta.processed_key = request_key
             yield wix_meta.wix
diff --git a/main.py b/main.py
index 7f8bd16..15877df 100644
--- a/main.py
+++ b/main.py
@@ -62,7 +62,7 @@
         help="Apply xformers' memory-efficient optimizations",
     )
     parser.add_argument(
-        "--disable-embeddings",
+        "--no-embed",
        action="store_true",
         help="Disable embeddings endpoint",
     )
@@ -80,6 +80,6 @@
             "LLAMA_API_XFORMERS": "1" if args.xformers else "",
             "LLAMA_API_API_KEY": args.api_key or "",
             "FORCE_CUDA": "1" if args.force_cuda else "",
-            "LLAMA_API_EMBEDDINGS": "1" if not args.disable_embeddings else "",
+            "LLAMA_API_EMBEDDINGS": "1" if not args.no_embed else "",
         },
     )
diff --git a/readme.md b/readme.md
index 7a3eadd..5a54926 100644
--- a/readme.md
+++ b/readme.md
@@ -3,11 +3,51 @@ This project aims to provide a simple way to run **LLama.cpp** and **Exllama** m
 
 You can use this server to run the models in your own application, or use it as a standalone API server!
 
+## Before you start
+
+1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/
+
+2. **llama.cpp**: To use llama.cpp, **Windows** users need to download [CMake](https://cmake.org/download/) to compile the library.
+
+3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). **Windows** users may also need to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).
+
+
+
+## How to run server
+
+All required packages will be installed automatically with this command.
+
+```bash
+python -m main --install-pkgs
+```
+
+If you already have all required packages installed, you can skip the installation with this command.
+```bash
+python -m main
+```
+Options:
+```bash
+usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-i] [-c] [--skip-torch-install] [--skip-tf-install] [--skip-compile] [-k API_KEY] [-x] [--no-embed]
+
+options:
+  -h, --help            show this help message and exit
+  -p PORT, --port PORT  Port to run the server on; default is 8000
+  -w MAX_WORKERS, --max-workers MAX_WORKERS
+                        Maximum number of process workers to run; default is 1
+  -i, --install-pkgs    Install all required packages before running the server
+  -c, --force-cuda      Force CUDA version of pytorch to be used when installing pytorch. e.g. torch==2.0.1+cu118
+  --skip-torch-install  Skip installing pytorch, if `install-pkgs` is set
+  --skip-tf-install     Skip installing tensorflow, if `install-pkgs` is set
+  --skip-compile        Skip compiling the shared library of LLaMA C++ code
+  -k API_KEY, --api-key API_KEY
+                        API key to use for the server
+  -x, --xformers        Apply xformers' memory-efficient optimizations
+  --no-embed            Disable embeddings endpoint
+```
 ### Unique features
 
 1. **On-Demand Model Loading**
-   > **Caution:** There is a bug where VRAM does not get freed when unloading, if **cuBLAS** is used in **llama.cpp**. This issue has been reported for a while but it's still unresolved.
    - The project tries to load the model defined in `model_definitions.py` into the worker process when it is sent along with the request JSON body. The worker continually uses the cached model and when a request for a different model comes in, it unloads the existing model and loads the new one.
 
 2. **Parallelism and Concurrency Enabled**
 
@@ -16,13 +56,6 @@ You can use this server to run the models in your own application, or use it as
 3. **Auto Dependency Installation**
    - The project automatically do git clones and installs the required dependencies, including **pytorch** and **tensorflow**, when the server is started. This is done by checking the `pyproject.toml` or `requirements.txt` file in the root directory of this project or other repositories. `pyproject.toml` will be parsed into `requirements.txt` with `poetry`. If you want to add more dependencies, simply add them to the file.
 
-## Before you start
-
-1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/
-
-2. **llama.cpp**: To use llama.cpp, and if you are **Windows** user, download [CMake](https://cmake.org/download/) to compile library.
-
-3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). Maybe **Windows** user needs to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).
 
 ## How to download the models
 
@@ -62,31 +95,7 @@ The path of the model has to be the folder name. Let's say, **orca_mini_7b**, wh
 ## Where to define the models
 Define llama.cpp & exllama models in `model_definitions.py`. You can define all necessary parameters to load the models there. Refer to the example in the file.
 
-## How to run server
-
-All required packages will be installed automatically with this command.
-
-```bash
-python -m main --install-pkgs
-```
-If you already have all required packages installed, you can skip the installation with this command.
-```bash -python -m main -``` -Options: -```b - -h, --help show this help message and exit - -p PORT, --port PORT Port to run the server on; default is 8000 - -w MAX_WORKERS, --max-workers MAX_WORKERS - Maximum number of process workers to run; default is 1 - --install-pkgs Install all required packages before running the server - --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - --skip-torch-install Skip installing pytorch, if `install-pkgs` is set - --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set - -k API_KEY, --api-key API_KEY - API key to use for the server -``` ## Usage: Text Completion Now, you can send a request to the server. diff --git a/requirements-all.txt b/requirements-all.txt deleted file mode 100644 index 8e1425d..0000000 --- a/requirements-all.txt +++ /dev/null @@ -1,26 +0,0 @@ -psutil -fastapi -uvicorn[standard] -transformers -orjson -llama_cpp_python[server] -safetensors==0.3.1 -sentencepiece>=0.1.97 -ninja==1.11.1 ---find-links https://download.pytorch.org/whl/torch_stable.html -torch==2.0.1+cu118 -numpy -scikit-learn -tensorflow>=2.0.0 -tensorflow-hub -scikit-build - - -# Dev -black -twine -flake8 -mkdocs -mkdocstrings -mkdocs-material -httpx \ No newline at end of file
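
With the two patches above applied, the quickest end-to-end check is a completion request against the running server. The snippet below is an illustrative sketch rather than part of the patch series: it assumes the server is listening on the default port 8000, that the v1 router exposes an OpenAI-style `/v1/completions` route (its docstring states "Use same format as OpenAI API"), and that a model named `orca_mini_7b` is defined in `model_definitions.py`; swap in your own host, model name, and API key (the header is only needed if the server was started with `--api-key`).

```python
# Illustrative sketch only: the endpoint path, model name, and payload fields
# follow the OpenAI completion format the server mirrors; adjust them to match
# your own model_definitions.py and server options.
import requests  # any HTTP client will do

response = requests.post(
    "http://localhost:8000/v1/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},  # omit if --api-key is unset
    json={
        "model": "orca_mini_7b",  # a model name defined in model_definitions.py
        "prompt": "Q: What is a llama?\nA:",
        "max_tokens": 64,
        "temperature": 0.7,
        "stream": False,
    },
    timeout=600,  # the first request may trigger an on-demand model load
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```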