From 668faeb733dd0fae5093a5a5f7ba16ef6eba87e5 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 01:07:22 +0900 Subject: [PATCH 01/15] Added logit processors --- llama_api/logits/base.py | 19 ++ llama_api/logits/bias.py | 56 ++++ llama_api/logits/muse.py | 78 +++++ llama_api/mixins/logits.py | 34 ++ llama_api/mixins/prompt_utils.py | 21 +- llama_api/modules/base.py | 25 +- llama_api/modules/exllama.py | 550 +++++++++++++++++++++---------- llama_api/modules/llama_cpp.py | 103 ++---- llama_api/schemas/api.py | 34 +- llama_api/schemas/models.py | 8 + llama_api/server/pools/llama.py | 18 +- llama_api/utils/errors.py | 4 +- llama_api/utils/process_pool.py | 11 + llama_api/utils/system.py | 2 +- 14 files changed, 669 insertions(+), 294 deletions(-) create mode 100644 llama_api/logits/base.py create mode 100644 llama_api/logits/bias.py create mode 100644 llama_api/logits/muse.py create mode 100644 llama_api/mixins/logits.py diff --git a/llama_api/logits/base.py b/llama_api/logits/base.py new file mode 100644 index 0000000..f7449a7 --- /dev/null +++ b/llama_api/logits/base.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + import torch as pytorch + + +class BaseLogitProcessor(ABC): + @abstractmethod + def with_torch( + self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor" + ) -> "pytorch.Tensor": + """Process logits with PyTorch tensors.""" + + @abstractmethod + def without_torch( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + """Process logits with Python lists.""" diff --git a/llama_api/logits/bias.py b/llama_api/logits/bias.py new file mode 100644 index 0000000..ebdae0f --- /dev/null +++ b/llama_api/logits/bias.py @@ -0,0 +1,56 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, Literal, Optional + +from .base import BaseLogitProcessor + +if TYPE_CHECKING: + import torch as pytorch + + +class LogitBiasProcessor(BaseLogitProcessor): + """Create a logit bias processor to bias the logit scores.""" + + def __init__( + self, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], + encoder: Callable[[str], List[int]], + ): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias = {} # type: Dict[int, float] + if logit_bias_type == "input_ids": + for input_id_string, score in logit_bias.items(): + to_bias[int(input_id_string)] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + for input_id in encoder(token): + to_bias[input_id] = score + + self._to_bias = to_bias + self._bias_tensor = None + + def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": + if self._bias_tensor is None: + import torch + + self._bias_tensor = torch.zeros( + scores.shape[-1], dtype=scores.dtype, device=scores.device + ) + for idx, value in self._to_bias.items(): + self._bias_tensor[idx] = value + + return self._bias_tensor + + def with_torch( + self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor" + ) -> "pytorch.Tensor": + return scores + self._get_bias_tensor(scores) + + def without_torch( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + for id, biased_score in self._to_bias.items(): + scores[id] += biased_score + return scores diff --git a/llama_api/logits/muse.py b/llama_api/logits/muse.py new file mode 100644 index 0000000..8c0414c --- /dev/null +++ b/llama_api/logits/muse.py @@ -0,0 +1,78 @@ +# flake8: noqa +from typing import TYPE_CHECKING, List, Tuple + +from 
.base import BaseLogitProcessor
+
+if TYPE_CHECKING:
+    import torch as pytorch
+
+
+class MuseLogitProcessor(BaseLogitProcessor):
+    """Performs dampening of the k highest probability elements.
+
+    Args:
+        top_k (`int`):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering.
+        damp (`float`, *optional*, defaults to 0.9):
+            How much less likely the top_k most likely tokens should be made. If set to 0, they become impossible.
+    """
+
+    def __init__(
+        self,
+        top_k: int = 3,
+        damp: float = 0.9,
+        damp_initial: float = 1.0,
+        damp_ramp_tokens: int = 32,
+        min_tokens_to_keep: int = 1,
+    ):
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError(
+                "`top_k` has to be a strictly positive integer, "
+                f"but is {top_k}"
+            )
+
+        self.top_k = max(top_k, min_tokens_to_keep)
+        self.damp = damp
+        self.damp_initial = damp_initial
+        self.damp_ramp_tokens = damp_ramp_tokens
+        self.token_num = 0
+
+    def with_torch(
+        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
+    ) -> "pytorch.Tensor":
+        import torch
+
+        top_k_safety = min(self.top_k, scores.size(-1))  # Safety check
+        linear_damp = self.linear_damp
+        topk_values, topk_indices = torch.topk(
+            scores, top_k_safety, dim=-1
+        )  # Specify the dimension
+        self.token_num += 1
+        return scores.scatter_(-1, topk_indices, topk_values * linear_damp)
+
+    def without_torch(
+        self, input_ids: List[int], scores: List[float]
+    ) -> List[float]:
+        top_k_safety = min(self.top_k, len(scores))  # Safety check
+        linear_damp = self.linear_damp
+        topk_values_indices = sorted(
+            range(len(scores)), key=lambda x: scores[x], reverse=True
+        )[:top_k_safety]
+        self.token_num += 1
+        return [
+            score * linear_damp if idx in topk_values_indices else score
+            for idx, score in enumerate(scores)
+        ]
+
+    @property
+    def linear_damp(self) -> float:
+        ratio = (
+            1.0
+            if self.damp_ramp_tokens == 0
+            else min(self.token_num / self.damp_ramp_tokens, 1.0)
+        )
+        return (
+            self.damp_initial + ratio * (self.damp - self.damp_initial)
+            if ratio < 1.0
+            else self.damp
+        )
diff --git a/llama_api/mixins/logits.py b/llama_api/mixins/logits.py
new file mode 100644
index 0000000..75867a1
--- /dev/null
+++ b/llama_api/mixins/logits.py
@@ -0,0 +1,34 @@
+from typing import Callable, List
+
+from ..logits.base import BaseLogitProcessor
+from ..logits.bias import LogitBiasProcessor
+from ..logits.muse import MuseLogitProcessor
+from ..schemas.api import TextGenerationSettings
+
+
+class LogitsMixin:
+    @staticmethod
+    def get_logit_processors(
+        settings: TextGenerationSettings, encoder: Callable[[str], List[int]]
+    ) -> List[BaseLogitProcessor]:
+        logit_processors: List[BaseLogitProcessor] = []
+        if settings.muse:
+            logit_processors.append(
+                MuseLogitProcessor(
+                    top_k=3,
+                    damp=0.9,
+                    damp_initial=1.0,
+                    damp_ramp_tokens=32,
+                    min_tokens_to_keep=1,
+                )
+            )
+        if settings.logit_bias is not None:
+            logit_processors.insert(
+                0,
+                LogitBiasProcessor(
+                    logit_bias=settings.logit_bias,
+                    logit_bias_type=settings.logit_bias_type,
+                    encoder=encoder,
+                ),
+            )
+        return logit_processors
diff --git a/llama_api/mixins/prompt_utils.py b/llama_api/mixins/prompt_utils.py
index adc6194..0b19dec 100644
--- a/llama_api/mixins/prompt_utils.py
+++ b/llama_api/mixins/prompt_utils.py
@@ -61,18 +61,23 @@ def convert_messages_into_prompt(
         return chat_history + f"### {ai_input_role}:"
 
     @staticmethod
-    def is_possible_to_generate_stops(
-        decoded_text: str, stops: List[str]
-    ) -> bool:
+    def is_possible_to_generate_stops(text: str, stops: List[str]) -> bool:
         """A helper method to
check if the decoded text contains any of the stop tokens.""" for stop in stops: - if stop in decoded_text or any( - [ - decoded_text.endswith(stop[: i + 1]) - for i in range(len(stop)) - ] + if stop in text or any( + [text.endswith(stop[: i + 1]) for i in range(len(stop))] ): return True return False + + @staticmethod + def raise_for_token_limit(prompt_tokens: int, context_window: int) -> None: + """A helper method to raise an error if the number of tokens + requested for completion exceeds the context window.""" + if prompt_tokens >= context_window: + raise ValueError( + f"Requested tokens ({prompt_tokens}) exceed " + f"context window of {context_window}" + ) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index fffc588..6bd6286 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -2,8 +2,10 @@ from dataclasses import dataclass from typing import Any, Iterator, List, TypeVar -from ..mixins.prompt_utils import PromptUtilsMixin +from llama_api.mixins.logits import LogitsMixin + from ..mixins.interrupt import InterruptMixin +from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( APIChatMessage, ChatCompletion, @@ -24,7 +26,9 @@ class BaseLLMModel: max_total_tokens: int = 2048 -class BaseCompletionGenerator(ABC, PromptUtilsMixin, InterruptMixin): +class BaseCompletionGenerator( + ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin +): """Base class for all completion generators.""" @abstractmethod @@ -38,14 +42,12 @@ def from_pretrained( cls, llm_model: "BaseLLMModel" ) -> "BaseCompletionGenerator": """Load a pretrained model into RAM.""" - ... @abstractmethod def generate_completion( self, prompt: str, settings: TextGenerationSettings ) -> Completion: """Generate a completion for a given prompt.""" - ... @abstractmethod def generate_completion_with_streaming( @@ -53,14 +55,12 @@ def generate_completion_with_streaming( ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - ... @abstractmethod def generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" - ... @abstractmethod def generate_chat_completion_with_streaming( @@ -68,20 +68,25 @@ def generate_chat_completion_with_streaming( ) -> Iterator[ChatCompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - ... + + @abstractmethod + def encode(self, text: str, **kwargs: Any) -> List[int]: + """Encode a text string into a list of token IDs.""" + + @abstractmethod + def decode(self, ids: List[int], **kwargs: Any) -> str: + """Decode a list of token IDs into a text string.""" @property @abstractmethod def llm_model(self) -> "BaseLLMModel": """The LLM model used by this generator.""" - ... class BaseEmbeddingGenerator(ABC): @abstractmethod def __del__(self): """Clean up resources.""" - ... @classmethod @abstractmethod @@ -96,10 +101,8 @@ def generate_embeddings( **kwargs: Any, ) -> List[List[float]]: """Generate embeddings for a list of texts.""" - ... @property @abstractmethod def model_name(self) -> str: """Identifier for the model used by this generator.""" - ... 
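Note on the logit-processor plumbing introduced in this patch: `LogitsMixin.get_logit_processors` builds the processor list from `TextGenerationSettings` (the `muse` flag and the `logit_bias` mapping), and generators apply each processor once per sampling step through `with_torch` or `without_torch`. A minimal sketch of the non-torch path follows; the `encode` stub, the example bias table, and the vocabulary size are illustrative assumptions, not part of the patch.

# Hypothetical usage sketch of the processors added by this patch.
from typing import List

from llama_api.logits.bias import LogitBiasProcessor
from llama_api.logits.muse import MuseLogitProcessor


def encode(text: str) -> List[int]:
    """Stand-in for a real tokenizer's encode callable (assumption)."""
    return [ord(char) for char in text]


processors = [
    LogitBiasProcessor(
        logit_bias={"123": 5.0},  # push token id 123 up by +5
        logit_bias_type="input_ids",
        encoder=encode,
    ),
    MuseLogitProcessor(top_k=3, damp=0.9, damp_ramp_tokens=32),
]

scores = [0.0] * 32000  # one logit per vocabulary entry (size is illustrative)
input_ids = [1, 2, 3]  # token ids generated so far
for processor in processors:
    scores = processor.without_torch(input_ids, scores)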
diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 2125bf8..7fb0a0f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,10 +1,21 @@ """Wrapper for exllama to generate text completions.""" -from contextlib import contextmanager from pathlib import Path -from typing import TYPE_CHECKING, Dict, Iterator, List, Optional +from typing import ( + TYPE_CHECKING, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, + overload, +) -from torch import IntTensor, Tensor, cuda +from torch import IntTensor, Tensor, cuda, version +from torch.nn.functional import log_softmax +from ..logits.base import BaseLogitProcessor from ..schemas.models import ExllamaModel from ..utils.completions import ( make_chat_completion, @@ -15,6 +26,7 @@ from ..utils.dependency import import_repository from ..utils.logger import ApiLogger from ..utils.path import resolve_model_path_to_posix +from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator with import_repository( @@ -35,28 +47,32 @@ TextGenerationSettings, ) -logger = ApiLogger("||🦙 exllama.generator||") assert cuda.is_available(), "CUDA must be available to use ExLlama." +logger = ApiLogger(__name__) +_stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops -def _encode(tokenizer: ExLlamaTokenizer, text: str) -> Tensor: - """Encode a text string into a tensor.""" - result = tokenizer.encode(text) - if isinstance(result, tuple): - return result[0] - else: - return result +def _make_config( + model_folder_path: Path, llm_model: "ExllamaModel" +) -> ExLlamaConfig: + """Create a config object for the ExLlama model.""" + # Find the model checkpoint + model_file_found: List[Path] = [] + for ext in (".safetensors", ".pt", ".bin"): + model_file_found.extend(model_folder_path.glob(f"*{ext}")) + if model_file_found: + if len(model_file_found) > 1: + logger.warning( + f"More than one {ext} model has been found. " + "The last one will be selected. It could be wrong." + ) -def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: - """Create a config object for the ExLlama model.""" - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ), - ) - config = ExLlamaConfig((model_folder_path / "config.json").as_posix()) + break + if not model_file_found: + raise FileNotFoundError( + f"No model has been found in {model_folder_path}." + ) # Find the model checkpoint model_file_found: List[Path] = [] @@ -74,6 +90,8 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: raise FileNotFoundError( f"No model has been found in {model_folder_path}." 
) + + config = ExLlamaConfig((model_folder_path / "config.json").as_posix()) config.model_path = model_file_found[-1].as_posix() # type: ignore config.max_seq_len = llm_model.max_total_tokens config.max_input_len = llm_model.max_total_tokens @@ -91,184 +109,299 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig: config.matmul_fused_remap = llm_model.matmul_fused_remap config.silu_no_half2 = llm_model.silu_no_half2 config.concurrent_streams = llm_model.concurrent_streams + if llm_model.alpha_value is not None: + config.alpha_value = llm_model.alpha_value + config.calculate_rotary_embedding_base() + if version.hip: + config.rmsnorm_no_half2 = True + config.rope_no_half2 = True + config.matmul_no_half2 = True + config.silu_no_half2 = True return config -def _make_tokenizer(llm_model: "ExllamaModel") -> ExLlamaTokenizer: - """Create a tokenizer object for the ExLlama model.""" - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ), +def _apply_settings_to_generator( + cg: "ExllamaCompletionGenerator", + settings: "TextGenerationSettings", +) -> ExLlamaGenerator: + """Apply the settings to the generator.""" + # Make sure that the batch size is correct + required_batch_size = 1 if settings.guidance_scale == 1 else 2 + cache_batch_size = cg.cache.batch_size # type: int + if cache_batch_size != required_batch_size: + cg._cache = None + deallocate_memory(cg._cache) + cg._cache = ExLlamaCache(cg._model, batch_size=required_batch_size) + cg._generator = ExLlamaGenerator( + model=cg._model, tokenizer=cg._tokenizer, cache=cg._cache + ) + # Temperature cannot be 0.0, so we use a very small value instead. + # 0.0 will cause a division by zero error. + generator = cg.generator + generator.settings.temperature = settings.temperature or 0.01 + generator.settings.top_p = settings.top_p + generator.settings.top_k = settings.top_k + generator.settings.typical = settings.typical_p + generator.settings.token_repetition_penalty_max = settings.repeat_penalty + generator.settings.token_repetition_penalty_sustain = ( + -1 + if settings.repetition_penalty_range <= 0 + else settings.repetition_penalty_range + ) + disallowed_tokens = ( + [generator.tokenizer.eos_token_id] if settings.ban_eos_token else None ) - return ExLlamaTokenizer( - (model_folder_path / "tokenizer.model").as_posix(), + generator.disallow_tokens(disallowed_tokens) + return generator + + +def _gen_single_token_with_cfg( + generator: ExLlamaGenerator, mask: Tensor, cfg_alpha: float +) -> int: + logits = generator.model.forward( + generator.sequence[:, -1:], cache=generator.cache, input_mask=mask + ) # type: Tensor # type: ignore + generator.apply_rep_penalty(logits) + probs = log_softmax(logits, dim=-1) + token, _ = generator.sample_current( + cfg_alpha * probs[0] + (1 - cfg_alpha) * probs[1] ) + generator.gen_accept_token(token.repeat(2, 1)) + return int(token.item()) + + +def _gen_single_token_without_cfg( + generator: ExLlamaGenerator, + initial_len: int, + constraints: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + logit_processors: Optional[Iterable[BaseLogitProcessor]] = None, +) -> int: + generator.end_beam_search() + + # Simple sampling case: + if generator.sequence is not None: + logits = generator.model.forward( + generator.sequence[:, -1:], + generator.cache, + lora=generator.lora, + input_mask=mask, + ) # type: Tensor # type: ignore + generator.apply_rep_penalty(logits) + logits[:, :, generator.tokenizer.bos_token_id] = -10000.0 + + if 
logit_processors is not None: + input_ids = generator.sequence[0][initial_len:] + for logit_processor in logit_processors: + logits = logit_processor.with_torch(input_ids, logits) + + if constraints is not None: + for constraint in constraints: + logits[:, :, constraint] += 10000.0 + logits[:, :, :] -= 10000.0 + + token, _ = generator.batched_sample( + logits, + generator.settings.temperature, + generator.settings.top_k, + generator.settings.top_p, + generator.settings.min_p + 0.01 + if constraints is not None + else 0.0, + generator.settings.typical, + ) + + else: + if constraints is not None: + token = constraints[0] + else: + token = Tensor([[generator.tokenizer.bos_token_id]]).long() + + generator.gen_accept_token(token) + return int(token.item()) + + +def _generator( + cg: "ExllamaCompletionGenerator", + settings: "TextGenerationSettings", + stops: List[str], + cfg_mask: Optional[Tensor] = None, +) -> Iterator[str]: + IdToPiece = cg.tokenizer.tokenizer.IdToPiece + decoder = cg.tokenizer.decode + generator = cg.generator + + cfg_alpha = settings.guidance_scale # type: float + initial_len = generator.sequence[0].shape[0] # type: int + eos_token_id = generator.tokenizer.eos_token_id # type: int + has_leading_space = False # type: bool + text_cursor = 0 # type: int + n_tokens = 0 # type: int + logit_processors = ( + [ + processor + for processor in BaseCompletionGenerator.get_logit_processors( + settings=settings, + encoder=cg.encode, + ) + ] + if cfg_mask is None + else None + ) # type: Optional[Iterable[BaseLogitProcessor]] + for n_tokens in range(1, settings.max_tokens + 1): + if cg.is_interrupted: + break # the generator was interrupted + + # Predict the next token id + if cfg_mask is not None: + token_id = _gen_single_token_with_cfg( + generator, mask=cfg_mask, cfg_alpha=cfg_alpha + ) + else: + token_id = _gen_single_token_without_cfg( + generator, + initial_len=initial_len, + logit_processors=logit_processors or None, + ) + if cg.is_interrupted or token_id == eos_token_id: + break + + # Yield the text piece + if n_tokens == 1: + has_leading_space = IdToPiece(token_id).startswith("▁") + decoded_text = ( + " " + str(decoder(generator.sequence[0][initial_len:])) + if has_leading_space + else str(decoder(generator.sequence[0][initial_len:])) + ) + text_piece = decoded_text[text_cursor:] + if "�" in text_piece: # Decode error when decoding multi-byte char + continue + if _stop_checker(text_piece, stops=stops): # Stop token found maybe + if any(stop in decoded_text for stop in stops): + break # Stop token found + continue + yield text_piece + text_cursor += len(text_piece) + # End of generation + cg._completion_status[settings.completion_id] = n_tokens + + +def _generate_text_with_streaming( + cg: "ExllamaCompletionGenerator", + prompt: str, + settings: "TextGenerationSettings", +) -> Iterator[str]: + try: + # Make sure that the stop token is a list + if isinstance(settings.stop, str): + stops = [settings.stop] # type: List[str] + elif isinstance(settings.stop, list): + stops = settings.stop + else: + stops = [] + + # Apply the settings to the generator + generator = _apply_settings_to_generator(cg, settings=settings) + + # Start the generator + if settings.guidance_scale == 1: + ids = _encode(cg.tokenizer, prompt) + mask = None # type: Optional[Tensor] + generator.end_beam_search() + generator.gen_begin_reuse(ids) + else: + ids, mask = _encode( + cg.tokenizer, + [prompt, settings.negative_prompt or ""], + return_mask=True, + ) + generator.gen_begin(ids, mask=mask) + 
cg.raise_for_token_limit( + prompt_tokens=ids.shape[-1], + context_window=cg.llm_model.max_total_tokens, + ) + yield from _generator( + cg, cfg_mask=mask, settings=settings, stops=stops + ) + except Exception as e: + logger.exception(e) + raise e class ExllamaCompletionGenerator(BaseCompletionGenerator): - config: Optional[ExLlamaConfig] = None - model: Optional[ExLlama] = None - cache: Optional[ExLlamaCache] = None - tokenizer: Optional[ExLlamaTokenizer] = None - generator: Optional[ExLlamaGenerator] = None + _config: Optional[ExLlamaConfig] = None + _model: Optional[ExLlama] = None + _cache: Optional[ExLlamaCache] = None + _tokenizer: Optional[ExLlamaTokenizer] = None + _generator: Optional[ExLlamaGenerator] = None _llm_model: Optional["ExllamaModel"] = None _completion_status: Dict[ str, int ] = {} # key: completion_id, value: number of completion tokens - def __del__(self) -> None: - if self.model is not None: - self.model.free_unmanaged() - del self.model - self.model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") - if self.tokenizer is not None: - getattr(self.tokenizer, "__del__", lambda: None)() - del self.tokenizer - self.tokenizer = None - logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") - if self.cache is not None: - getattr(self.cache, "__del__", lambda: None)() - del self.cache - self.cache = None - logger.info("🗑️ ExllamaCompletionGenerator cache deleted") - @property def llm_model(self) -> "ExllamaModel": assert self._llm_model is not None return self._llm_model + @property + def generator(self) -> ExLlamaGenerator: + assert self._generator is not None, "Generator is not initialized." + return self._generator + + @property + def tokenizer(self) -> ExLlamaTokenizer: + assert self._tokenizer is not None, "Tokenizer is not initialized." + return self._tokenizer + + @property + def cache(self) -> ExLlamaCache: + assert self._cache is not None, "Cache is not initialized." + return self._cache + + @property + def model(self) -> ExLlama: + assert self._model is not None, "Model is not initialized." + return self._model + + @property + def config(self) -> ExLlamaConfig: + assert self._config is not None, "Config is not initialized." + return self._config + @classmethod def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": result = cls() - result.config = _make_config(llm_model) - result.tokenizer = _make_tokenizer(llm_model) - result.model = ExLlama(result.config) - result.cache = ExLlamaCache(result.model) - result.generator = None - result._llm_model = llm_model - return result - - @contextmanager - def _generator_context_manager( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator[ExLlamaGenerator]: - """Make a generator object for the ExLlama model.""" - assert self.model is not None, "Model is not initialized." - assert self.tokenizer is not None, "Tokenizer is not initialized." - assert self.cache is not None, "Cache is not initialized." - - generator = ExLlamaGenerator( - model=self.model, - tokenizer=self.tokenizer, - cache=self.cache, + model_folder_path = Path( + resolve_model_path_to_posix( + llm_model.model_path, + default_relative_directory="models/gptq", + ) ) - # Temperature cannot be 0.0, so we use a very small value instead. - # 0.0 will cause a division by zero error. 
- generator.settings.temperature = settings.temperature or 0.01 - generator.settings.top_p = settings.top_p - generator.settings.top_k = settings.top_k - generator.settings.typical = settings.typical_p - generator.settings.token_repetition_penalty_max = ( - settings.repeat_penalty + result._config = _make_config(model_folder_path, llm_model) + result._tokenizer = ExLlamaTokenizer( + (model_folder_path / "tokenizer.model").as_posix() ) - if ( - settings.ban_eos_token - and generator.tokenizer.eos_token_id is not None - ): - generator.disallow_tokens([generator.tokenizer.eos_token_id]) - - generator.end_beam_search() - generator.gen_begin_reuse(generator.tokenizer.encode(prompt)) - yield generator - del generator - - def _generate_text( - self, prompt: str, settings: "TextGenerationSettings" - ) -> str: - return "".join( - self._generate_text_with_streaming(prompt, settings=settings) - ) - - def _generate_text_with_streaming( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator[str]: - assert ( - self.model is not None - and self.tokenizer is not None - and self.cache is not None + result._model = ExLlama(result._config) + result._cache = ExLlamaCache(result._model) + result._generator = ExLlamaGenerator( + result._model, result._tokenizer, result._cache ) - - # Make sure that the stop token is a list - if isinstance(settings.stop, str): - stops = [settings.stop] - elif isinstance(settings.stop, list): - stops = settings.stop - else: - stops = [] - - with self._generator_context_manager( - prompt, settings=settings - ) as generator: - # Start generation - initial_len = generator.sequence[0].shape[0] - has_leading_space: bool = False - text_cursor: int = 0 - n_completion_tokens: int = 0 - - for n_completion_tokens in range(1, settings.max_tokens + 1): - if self.is_interrupted: - return # the generator was interrupted - token = generator.gen_single_token() - if self.is_interrupted: - return # the generator was interrupted - if token.item() == generator.tokenizer.eos_token_id: - return - if ( - n_completion_tokens == 0 - and generator.tokenizer.tokenizer.IdToPiece( - int(token) - ).startswith("▁") - ): - has_leading_space = True - - decoded_text = str( - generator.tokenizer.decode( - generator.sequence[0][initial_len:] - ) - ) - if has_leading_space: - decoded_text = " " + decoded_text - if self.is_possible_to_generate_stops( - decoded_text, stops=stops - ): - for stop in stops: - if stop in decoded_text: - return - continue - text_piece = decoded_text[text_cursor:] - if "�" in text_piece: - continue - yield text_piece - text_cursor += len(text_piece) - self._completion_status[ - settings.completion_id - ] = n_completion_tokens + result._llm_model = llm_model + return result def generate_completion_with_streaming( self, prompt: str, settings: "TextGenerationSettings" ) -> Iterator["CompletionChunk"]: - assert self.config is not None and self.tokenizer is not None completion_id: str = settings.completion_id model_path: str = str(self.config.model_path) last_token: Optional[str] = None generated_text: str = "" - for token in self._generate_text_with_streaming( - prompt, settings=settings + for token in _generate_text_with_streaming( + self, prompt=prompt, settings=settings ): generated_text += token if last_token is not None: @@ -285,7 +418,8 @@ def generate_completion_with_streaming( text=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] + completion_id, + 
_encode(self.tokenizer, generated_text).shape[1], ) >= settings.max_tokens else "stop", @@ -294,9 +428,12 @@ def generate_completion_with_streaming( def generate_completion( self, prompt: str, settings: "TextGenerationSettings" ) -> "Completion": - assert self.tokenizer is not None and self.config is not None completion_id: str = settings.completion_id - generated_text: str = self._generate_text(prompt, settings=settings) + generated_text: str = "".join( + _generate_text_with_streaming( + self, prompt=prompt, settings=settings + ) + ) n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] n_completion_tokens: int = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] @@ -317,14 +454,13 @@ def generate_chat_completion_with_streaming( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> Iterator["ChatCompletionChunk"]: - assert self.config is not None and self.tokenizer is not None completion_id: str = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) model_path: str = str(self.config.model_path) last_token: Optional[str] = None generated_text: str = "" - for token in self._generate_text_with_streaming( - prompt, settings=settings + for token in _generate_text_with_streaming( + self, prompt=prompt, settings=settings ): generated_text += token if last_token is not None: @@ -341,7 +477,8 @@ def generate_chat_completion_with_streaming( content=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] + completion_id, + _encode(self.tokenizer, generated_text).shape[1], ) else "stop", ) @@ -351,10 +488,13 @@ def generate_chat_completion( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> "ChatCompletion": - assert self.tokenizer is not None and self.config is not None completion_id: str = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text: str = self._generate_text(prompt, settings=settings) + generated_text: str = "".join( + _generate_text_with_streaming( + self, prompt=prompt, settings=settings + ) + ) prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] completion_tokens: int = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] @@ -370,10 +510,62 @@ def generate_chat_completion( else "stop", ) - def encode(self, message: str, /) -> List[int]: - assert self.tokenizer is not None, "Tokenizer is not initialized" - return _encode(self.tokenizer, message).flatten().tolist() + def encode(self, text: str) -> List[int]: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return _encode(self._tokenizer, text).flatten().tolist() - def decode(self, tokens: List[int], /) -> str: - assert self.tokenizer is not None, "Tokenizer is not initialized" - return str(self.tokenizer.decode(IntTensor(tokens))) + def decode(self, ids: List[int], **kwargs) -> str: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return str(self._tokenizer.decode(IntTensor(ids))) + + def __del__(self) -> None: + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + if self._tokenizer is not None: + getattr(self._tokenizer, "__del__", lambda: None)() + del self._tokenizer + self._tokenizer = None + logger.info("🗑️ 
ExllamaCompletionGenerator tokenizer deleted") + if self._cache is not None: + getattr(self._cache, "__del__", lambda: None)() + del self._cache + self._cache = None + logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + + +@overload +def _encode( + tokenizer: ExLlamaTokenizer, + text: str, + return_mask: bool = False, +) -> Tensor: + ... + + +@overload +def _encode( + tokenizer: ExLlamaTokenizer, + text: List[str], + return_mask: bool = True, +) -> Tuple[Tensor, Tensor]: + ... + + +def _encode( + tokenizer: ExLlamaTokenizer, + text: Union[str, List[str]], + return_mask: bool = False, +) -> Union[Tensor, Tuple[Tensor, Tensor]]: + """Encode a text string into a tensor.""" + result = tokenizer.encode(text, return_mask=return_mask) + if return_mask: + ids, mask = result + assert isinstance(ids, Tensor) and isinstance(mask, Tensor) + return ids, mask + else: + ids = result[0] if isinstance(result, tuple) else result + assert isinstance(ids, Tensor) + return ids diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 3aa49b6..77a2df1 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,12 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import Dict, Iterator, List, Literal, Optional, Union +from typing import ( # noqa: F401 + Callable, + Iterator, + List, + Optional, + Union, +) from ..schemas.api import ( APIChatMessage, @@ -29,46 +35,23 @@ from repositories.llama_cpp import llama_cpp -def _make_logit_bias_processor( - llama: llama_cpp.Llama, - logit_bias: Dict[str, float], - logit_bias_type: Optional[Literal["input_ids", "tokens"]], -): - """Create a logit bias processor to bias the logit scores.""" - if logit_bias_type is None: - logit_bias_type = "input_ids" - - to_bias: Dict[int, float] = {} - if logit_bias_type == "input_ids": - for input_id_string, score in logit_bias.items(): - to_bias[int(input_id_string)] = score - - elif logit_bias_type == "tokens": - for token, score in logit_bias.items(): - for input_id in llama.tokenize( - token.encode("utf-8"), add_bos=False - ): - to_bias[input_id] = score - - def logit_bias_processor( - input_ids: List[int], - scores: List[float], - ) -> List[float]: - new_scores: List[float] = [0.0] * len(scores) - for input_id, score in enumerate(scores): - new_scores[input_id] = score + to_bias.get(input_id, 0.0) - - return new_scores - - return logit_bias_processor - - def _create_completion( client: llama_cpp.Llama, prompt: str, stream: bool, settings: TextGenerationSettings, ) -> Union[Completion, Iterator[CompletionChunk]]: + logit_processors = llama_cpp.LogitsProcessorList( + [ + processor.without_torch + for processor in BaseCompletionGenerator.get_logit_processors( + settings=settings, + encoder=lambda s: client.tokenize( + s.encode("utf-8"), add_bos=False + ), + ) + ] + ) return client.create_completion( # type: ignore stream=stream, prompt=prompt, @@ -85,17 +68,7 @@ def _create_completion( mirostat_mode=settings.mirostat_mode, mirostat_tau=settings.mirostat_tau, mirostat_eta=settings.mirostat_eta, - logits_processor=llama_cpp.LogitsProcessorList( - [ - _make_logit_bias_processor( - client, - settings.logit_bias, - settings.logit_bias_type, - ), - ] - ) - if settings.logit_bias is not None - else None, + logits_processor=logit_processors if logit_processors else None, stop=settings.stop, ) @@ -109,40 +82,16 @@ def _create_chat_completion( prompt: str = LlamaCppCompletionGenerator.convert_messages_into_prompt( messages, 
settings=settings ) - completion_or_chunks = client( - prompt=prompt, - temperature=settings.temperature, - top_p=settings.top_p, - top_k=settings.top_k, - stream=stream, - max_tokens=settings.max_tokens, - repeat_penalty=settings.repeat_penalty, - presence_penalty=settings.presence_penalty, - frequency_penalty=settings.frequency_penalty, - tfs_z=settings.tfs_z, - mirostat_mode=settings.mirostat_mode, - mirostat_tau=settings.mirostat_tau, - mirostat_eta=settings.mirostat_eta, - logits_processor=llama_cpp.LogitsProcessorList( - [ - _make_logit_bias_processor( - client, - settings.logit_bias, - settings.logit_bias_type, - ), - ] - ) - if settings.logit_bias is not None - else None, - stop=settings.stop, + completion_or_chunks = _create_completion( + client=client, prompt=prompt, stream=stream, settings=settings ) if isinstance(completion_or_chunks, Iterator): return convert_text_completion_chunks_to_chat( - completion_or_chunks, # type: ignore + completion_or_chunks, ) else: return convert_text_completion_to_chat( - completion_or_chunks, # type: ignore + completion_or_chunks, ) @@ -294,12 +243,12 @@ def generate_chat_completion_with_streaming( return # the generator was interrupted yield chunk - def encode(self, text: str, add_bos: bool = True) -> List[int]: + def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: assert self.client is not None, "Client is not initialized" return self.client.tokenize( text.encode("utf-8", errors="ignore"), add_bos=add_bos ) - def decode(self, tokens: List[int]) -> str: + def decode(self, ids: List[int], **kwargs) -> str: assert self.client is not None, "Client is not initialized" - return self.client.detokenize(tokens).decode("utf-8", errors="ignore") + return self.client.detokenize(ids).decode("utf-8", errors="ignore") diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 45a08e3..93f8dab 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -46,7 +46,7 @@ class APIChatMessage(BaseModel): ) class Config: - from_attributes = True + frozen = True class TextGenerationSettings(BaseModel): @@ -114,7 +114,6 @@ class TextGenerationSettings(BaseModel): "t so far, decreasing the model's likelihood to repeat the same line verbatim." ), ) - presence_penalty: float = Field( default=0.0, ge=-2.0, @@ -136,6 +135,13 @@ class TextGenerationSettings(BaseModel): "9) will be more lenient." ), ) + repetition_penalty_range: int = Field( + default=0, + ge=0, + description=( + "The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used." + ), + ) top_k: int = Field( default=40, ge=0, @@ -195,6 +201,24 @@ class TextGenerationSettings(BaseModel): default=False, description="If True, the EOS token is banned from being generated.", ) + muse: bool = Field( + default=False, + description="Use Muse logit processor (experimental). " + "Muse logit processor performs dampening of the k highest probability elements.", + ) + guidance_scale: float = Field( + default=1.0, + ge=1.0, + description="The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`. " + "Higher guidance scale encourages the model to generate samples that are more closely linked to the input " + "prompt, usually at the expense of poorer quality", + ) + negative_prompt: Optional[str] = Field( + default=None, + description="The negative prompt for classifier free guidance (CFG). 
" + "The negative prompt is used to encourage the model not to generate samples that are too similar to the " + "negative prompt. CFG is enabled by setting `guidance_scale > 1`.", + ) class CreateEmbeddingRequest(BaseModel): @@ -243,6 +267,12 @@ class CreateChatCompletionRequest(TextGenerationSettings): stream: bool = Field( default=False, description="Whether to stream the response." ) + functions: Optional[FunctionProperty] = Field( + default=None, description="The functions to invoke." + ) + function_call: Optional[ + Union[FunctionProperty, Literal["auto", "none"]] + ] = Field(default=None, description="The function call to invoke.") class Config: json_schema_extra = { diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 4704834..4fa59f5 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -101,6 +101,14 @@ class ExllamaModel(BaseLLMModel): "window size from 2048 to 4096, set this to 2.0." }, ) + alpha_value: Optional[float] = field( + default=None, + metadata={ + "description": "Positional embeddings alpha factor for " + "NTK RoPE scaling. Use either this or compress_pos_emb, " + "not both at the same time." + }, + ) gpu_peer_fix: bool = field( default=False, metadata={ diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 75f3a2b..eca2af2 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -94,8 +94,6 @@ def get_completion_generator( ) if body.model in openai_replacement_models: body.model = openai_replacement_models[body.model] - if not isinstance(body, CreateEmbeddingRequest): - body.logit_bias = None # Check if the model is defined in LLMModels enum llm_model = get_model(body.model) @@ -111,17 +109,13 @@ def get_completion_generator( # Before creating new one, deallocate embeddings to free up memory if embedding_generators: free_memory_of_first_item_from_container( - embedding_generators, - min_free_memory_mb=512, - logger=logger, + embedding_generators, logger=logger ) # Before creating a new completion generator, check memory usage if completion_generators.maxlen == len(completion_generators): free_memory_of_first_item_from_container( - completion_generators, - min_free_memory_mb=256, - logger=logger, + completion_generators, logger=logger ) # Create a new completion generator @@ -167,16 +161,12 @@ def get_embedding_generator( # Before creating a new completion generator, check memory usage if embedding_generators.maxlen == len(embedding_generators): free_memory_of_first_item_from_container( - embedding_generators, - min_free_memory_mb=256, - logger=logger, + embedding_generators, logger=logger ) # Before creating a new, deallocate embeddings to free up memory if completion_generators: free_memory_of_first_item_from_container( - completion_generators, - min_free_memory_mb=512, - logger=logger, + completion_generators, logger=logger ) if "sentence" in body.model and "encoder" in body.model: diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index f1580b3..f36a328 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -167,7 +167,7 @@ def error_message_wrapper( return 500, ErrorResponse( message=str(error), type="internal_server_error", - param=f"traceback:: {parse_trackback(error)}", + param=f"traceback:: {parse_traceback(error)}", code=type(error).__name__, ) @@ -255,7 +255,7 @@ async def custom_route_handler(self, request: Request) -> Response: ) -def parse_trackback(exception: Exception) -> str: +def parse_traceback(exception: 
Exception) -> str: """Parses traceback information from the exception""" if ( exception.__traceback__ is not None diff --git a/llama_api/utils/process_pool.py b/llama_api/utils/process_pool.py index ecaed7a..c3358d4 100644 --- a/llama_api/utils/process_pool.py +++ b/llama_api/utils/process_pool.py @@ -9,6 +9,7 @@ from multiprocessing import Process, Queue, cpu_count from threading import Thread from time import sleep +from traceback import format_exception from types import TracebackType from typing import ( Any, @@ -22,11 +23,15 @@ Union, ) +from llama_api.utils.logger import ApiLogger + if sys.version_info >= (3, 10): from typing import ParamSpec else: from typing_extensions import ParamSpec +logger = ApiLogger(__name__) + class _WrappedWorkerException(Exception): # type: ignore def __init__( @@ -146,6 +151,9 @@ def _worker_job_loop( except Exception as e: # If it fails, we need to send the exception back error = _WrappedWorkerException(str(e), e.__class__.__name__) + logger.error( + "".join(format_exception(e.__class__, e, e.__traceback__)) + ) result = None try: # We're using pickle to serialize the result @@ -261,6 +269,9 @@ def result(self) -> Optional[Tuple[Any, Exception]]: unwrapped_err = err.exception unwrapped_err.__traceback__ = err.traceback err = unwrapped_err + logger.error( + f"Error in worker process: {err.__class__.__name__}: {err}" + ) return ret, err except queue.Empty: if not self.process.is_alive(): diff --git a/llama_api/utils/system.py b/llama_api/utils/system.py index f2f4410..78ca571 100644 --- a/llama_api/utils/system.py +++ b/llama_api/utils/system.py @@ -162,7 +162,7 @@ def empty_cache(): logger.warning( ( f"RAM + VRAM usage did not decrease " - "by at least {min_free_memory_mb} MB " + f"by at least {min_free_memory_mb} MB " "after removing the oldest object.\n" "This may indicate a memory leak.\n" f"- Memory usage before: {mem_usage_before} MB\n" From 9a726b4a9eae1dd1b81d454a3b0e970d54da9af9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 11:39:57 +0900 Subject: [PATCH 02/15] Added xformers --- llama_api/modules/base.py | 6 +- llama_api/modules/exllama.py | 34 +++-- llama_api/modules/llama_cpp.py | 46 ++---- llama_api/modules/xformers.py | 133 ++++++++++++++++++ llama_api/schemas/api.py | 2 +- .../{function_calling.py => function_call.py} | 0 llama_api/schemas/models.py | 23 ++- llama_api/server/app_settings.py | 9 +- llama_api/server/routers/v1.py | 2 +- llama_api/utils/concurrency.py | 11 +- llama_api/utils/dependency.py | 4 +- llama_api/utils/errors.py | 2 +- main.py | 19 ++- 13 files changed, 222 insertions(+), 69 deletions(-) create mode 100644 llama_api/modules/xformers.py rename llama_api/schemas/{function_calling.py => function_call.py} (100%) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 6bd6286..09b8291 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass +from dataclasses import asdict, dataclass from typing import Any, Iterator, List, TypeVar from llama_api.mixins.logits import LogitsMixin @@ -25,6 +25,10 @@ class BaseLLMModel: model_path: str = "/path/to/model" max_total_tokens: int = 2048 + @property + def asdict(self) -> dict: + return asdict(self) + class BaseCompletionGenerator( ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 7fb0a0f..293682d 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ 
-1,4 +1,19 @@ """Wrapper for exllama to generate text completions.""" +# flake8: noqa +from os import environ + +from ..utils.logger import ApiLogger + +logger = ApiLogger(__name__) +if environ.get("LLAMA_API_XFORMERS") == "1": + try: + from ..modules.xformers import hijack_attention_forward + + hijack_attention_forward() + except Exception as e: + logger.warning( + f"xformers mode is enabled, but xformers is not installed: {e}" + ) from pathlib import Path from typing import ( TYPE_CHECKING, @@ -24,8 +39,6 @@ make_completion_chunk, ) from ..utils.dependency import import_repository -from ..utils.logger import ApiLogger -from ..utils.path import resolve_model_path_to_posix from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator @@ -48,7 +61,7 @@ ) assert cuda.is_available(), "CUDA must be available to use ExLlama." -logger = ApiLogger(__name__) + _stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops @@ -126,7 +139,7 @@ def _apply_settings_to_generator( ) -> ExLlamaGenerator: """Apply the settings to the generator.""" # Make sure that the batch size is correct - required_batch_size = 1 if settings.guidance_scale == 1 else 2 + required_batch_size = 1 if settings.guidance_scale <= 1 else 2 cache_batch_size = cg.cache.batch_size # type: int if cache_batch_size != required_batch_size: cg._cache = None @@ -159,7 +172,9 @@ def _gen_single_token_with_cfg( generator: ExLlamaGenerator, mask: Tensor, cfg_alpha: float ) -> int: logits = generator.model.forward( - generator.sequence[:, -1:], cache=generator.cache, input_mask=mask + generator.sequence[:, -1:], + cache=generator.cache, + input_mask=mask, ) # type: Tensor # type: ignore generator.apply_rep_penalty(logits) probs = log_softmax(logits, dim=-1) @@ -183,7 +198,7 @@ def _gen_single_token_without_cfg( if generator.sequence is not None: logits = generator.model.forward( generator.sequence[:, -1:], - generator.cache, + cache=generator.cache, lora=generator.lora, input_mask=mask, ) # type: Tensor # type: ignore @@ -375,12 +390,7 @@ def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": result = cls() - model_folder_path = Path( - resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/gptq", - ) - ) + model_folder_path = Path(llm_model.model_path_resolved) result._config = _make_config(model_folder_path, llm_model) result._tokenizer = ExLlamaTokenizer( (model_folder_path / "tokenizer.model").as_posix() diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 77a2df1..10501b9 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,7 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature from typing import ( # noqa: F401 + TYPE_CHECKING, Callable, Iterator, List, @@ -23,7 +24,6 @@ ) from ..utils.dependency import import_repository from ..utils.logger import ApiLogger -from ..utils.path import resolve_model_path_to_posix from .base import BaseCompletionGenerator logger = ApiLogger(__name__) @@ -122,40 +122,16 @@ def llm_model(self) -> "LlamaCppModel": def from_pretrained( cls, llm_model: "LlamaCppModel" ) -> "LlamaCppCompletionGenerator": - additional_kwargs = {} - arg_keys = signature(llama_cpp.Llama.__init__).parameters.keys() - if "rope_freq_base" in arg_keys: - additional_kwargs.update( - {"rope_freq_base": llm_model.rope_freq_base}, - ) - if "rope_freq_scale" in arg_keys: - additional_kwargs.update( - {"rope_freq_scale": 
llm_model.rope_freq_scale} - ) - client = llama_cpp.Llama( - model_path=resolve_model_path_to_posix( - llm_model.model_path, - default_relative_directory="models/ggml", - ), - n_ctx=llm_model.max_total_tokens, - n_parts=llm_model.n_parts, - n_gpu_layers=llm_model.n_gpu_layers, - seed=llm_model.seed, - f16_kv=llm_model.f16_kv, - logits_all=llm_model.logits_all, - vocab_only=llm_model.vocab_only, - use_mmap=llm_model.use_mmap, - use_mlock=llm_model.use_mlock, - embedding=llm_model.embedding, - n_threads=llm_model.n_threads, - n_batch=llm_model.n_batch, - last_n_tokens_size=llm_model.last_n_tokens_size, - lora_base=llm_model.lora_base, - lora_path=llm_model.lora_path, - low_vram=llm_model.low_vram, - verbose=llm_model.echo, - **additional_kwargs, - ) + kwargs = { + # Get all attributes of llm_model + key: value + for key, value in llm_model.asdict.items() + # Hacky way to pass arguments to older versions of llama-cpp-python + if key in signature(llama_cpp.Llama.__init__).parameters.keys() + } + kwargs["model_path"] = llm_model.model_path_resolved + kwargs["verbose"] = llm_model.verbose and llm_model.echo + client = llama_cpp.Llama(**kwargs) if llm_model.cache: cache_type = llm_model.cache_type if cache_type is None: diff --git a/llama_api/modules/xformers.py b/llama_api/modules/xformers.py new file mode 100644 index 0000000..16d5695 --- /dev/null +++ b/llama_api/modules/xformers.py @@ -0,0 +1,133 @@ +# flake8: noqa +import math +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import transformers.models.llama.modeling_llama +from xformers.ops import memory_efficient_attention, LowerTriangularMask +from torch import Tensor, cat, finfo, float32, matmul, softmax, tensor + +from ..utils.logger import ApiLogger + +if TYPE_CHECKING: + from transformers.models.llama.modeling_llama import LlamaAttention + + +logger = ApiLogger(__name__) + + +def hijack_attention_forward(): + transformers.models.llama.modeling_llama.LlamaAttention.forward = _forward + logger.info(f"Replaced attention forward with {__name__.split('.')[-1]}") + + +def _forward( + self: "LlamaAttention", + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + past_key_value: Optional[Tuple[Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + # COPY: oobabooga/text-generation-webui/modules/llama_attn_hijack.py + logger.info(f"Using {__name__.split('.')[-1]}") + bsz, q_len, _ = hidden_states.size() + + query_states = ( + self.q_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + key_states = ( + self.k_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + ( + query_states, + key_states, + ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = cat([past_key_value[0], key_states], dim=2) + value_states = cat([past_key_value[1], value_states], dim=2) # type: ignore + + past_key_value = (key_states, value_states) if use_cache else None # 
type: ignore + + # We only apply xformers optimizations if we don't need to output the whole attention matrix + if not output_attentions: + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. + # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. + if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = memory_efficient_attention( + query_states, key_states, value_states, attn_bias=None + ) + else: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = memory_efficient_attention( + query_states, + key_states, + value_states, + attn_bias=LowerTriangularMask(), + ) + attn_weights = None + else: + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, tensor(finfo(attn_weights.dtype).min) + ) + + # upcast attention to fp32 + attn_weights = softmax(attn_weights, dim=-1, dtype=float32).to( + query_states.dtype + ) + attn_output = matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + return ( + self.o_proj(attn_output.reshape(bsz, q_len, self.hidden_size)), + attn_weights, + past_key_value, + ) diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 93f8dab..543c76c 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -8,7 +8,7 @@ from pydantic.main import BaseModel from typing_extensions import TypedDict -from .function_calling import FunctionProperty +from .function_call import FunctionProperty # If python version >= 3.11, use the built-in NotRequired type. # Otherwise, import it from typing_extensi diff --git a/llama_api/schemas/function_calling.py b/llama_api/schemas/function_call.py similarity index 100% rename from llama_api/schemas/function_calling.py rename to llama_api/schemas/function_call.py diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 4fa59f5..9e4027d 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -1,6 +1,9 @@ from dataclasses import dataclass, field +from functools import cached_property from typing import List, Literal, Optional +from llama_api.utils.path import resolve_model_path_to_posix + from ..modules.base import BaseLLMModel @@ -64,7 +67,8 @@ class LlamaCppModel(BaseLLMModel): cache: bool = ( False # The size of the cache in bytes. Only used if cache is True. ) - echo: bool = True # Whether to echo the prompt. + verbose: bool = True # Whether to echo the prompt. 
+ echo: bool = True # Compatibility of verbose. lora_base: Optional[str] = None # The path to the Llama LoRA base model. lora_path: Optional[ str @@ -86,6 +90,16 @@ class LlamaCppModel(BaseLLMModel): # Refer: https://github.com/ggerganov/llama.cpp/pull/2054 rope_freq_base: float = 10000.0 # I use 26000 for n_ctx=4096. rope_freq_scale: float = 1.0 # Generally, 2048 / n_ctx. + n_gqa: Optional[int] = None # TEMPORARY: Set to 8 for Llama2 70B + rms_norm_eps: Optional[float] = None # TEMPORARY + mul_mat_q: Optional[bool] = None # TEMPORARY + + @cached_property + def model_path_resolved(self): + return resolve_model_path_to_posix( + self.model_path, + default_relative_directory="models/ggml", + ) @dataclass @@ -136,3 +150,10 @@ class ExllamaModel(BaseLLMModel): matmul_no_half2: bool = False silu_no_half2: bool = False concurrent_streams: bool = False + + @cached_property + def model_path_resolved(self): + return resolve_model_path_to_posix( + self.model_path, + default_relative_directory="models/gptq", + ) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index bd472da..9c30a7f 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -105,6 +105,8 @@ def initialize_before_launch( # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") + if environ.get("LLAMA_API_XFORMERS") == "1": + install_package("xformers") else: logger.warning( "🏃‍♂️ Skipping package installation... " @@ -149,13 +151,12 @@ async def health(): def run( port: int, - max_workers: int = 1, install_packages: bool = False, force_cuda: bool = False, skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, - api_key: Optional[str] = None, + environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( git_and_disk_paths=Config.git_and_disk_paths, @@ -169,8 +170,8 @@ def run( from uvicorn import Config as UvicornConfig from uvicorn import Server as UvicornServer - environ["MAX_WORKERS"] = str(max_workers) - environ["API_KEY"] = api_key or "" + if environs: + environ.update(environs) UvicornServer( config=UvicornConfig( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 6b9715f..9e8b7f7 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -87,7 +87,7 @@ class WixMetadata: # processing a request. This is used to prevent multiple requests from # creating multiple completion generators at the same time. 
wixs: Tuple[WixMetadata] = tuple( - WixMetadata() for _ in range(int(environ.get("MAX_WORKERS", 1))) + WixMetadata() for _ in range(int(environ.get("LLAMA_API_MAX_WORKERS", 1))) ) diff --git a/llama_api/utils/concurrency.py b/llama_api/utils/concurrency.py index 1b5d877..797a1be 100644 --- a/llama_api/utils/concurrency.py +++ b/llama_api/utils/concurrency.py @@ -36,13 +36,6 @@ def init_process_pool(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): environ[key] = value - cuda_home = environ.get("CUDA_HOME", None) - cuda_path = environ.get("CUDA_PATH", None) - if cuda_path is not None and cuda_home is None: - environ["CUDA_HOME"] = cuda_path - elif cuda_home is not None and cuda_path is None: - environ["CUDA_PATH"] = cuda_home - def pool() -> ProcessPool: """Get the process pool, and initialize it if it's not initialized yet""" @@ -51,14 +44,14 @@ def pool() -> ProcessPool: if _pool is None: logger.info("Initializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("MAX_WORKERS", 1)), + max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), initializer=init_process_pool, initargs=(dict(environ),), ) elif not _pool.is_available: logger.critical("🚨 Process pool died. Reinitializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("MAX_WORKERS", 1)), + max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), initializer=init_process_pool, initargs=(dict(environ),), ) diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index a0cb6c3..8dbee87 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -218,12 +218,12 @@ def import_repository(git_path: str, disk_path: str): sys.path.remove(str(disk_path)) -def install_package(package: str, force: bool = False) -> bool: +def install_package(package: str, *args, force: bool = False) -> bool: """Install a package with pip.""" if not force and is_package_available(package): return True return run_command( - [sys.executable, "-m", "pip", "install", package], + [sys.executable, "-m", "pip", "install", package, *args], action="install", name=package, ) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index f36a328..dc7f4e2 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = environ.get("API_KEY", None) or None + api_key: Optional[str] = environ.get("LLAMA_API_API_KEY") or None @cached_property def authorization(self) -> Optional[str]: diff --git a/main.py b/main.py index 74a0f1e..9b8d7f7 100644 --- a/main.py +++ b/main.py @@ -41,6 +41,11 @@ action="store_true", help="Skip installing tensorflow, if `install-pkgs` is set", ) + parser.add_argument( + "--skip-compile", + action="store_true", + help="Skip compiling the shared library of LLaMA C++ code", + ) parser.add_argument( "-k", "--api-key", @@ -48,14 +53,24 @@ default=None, help="API key to use for the server", ) + parser.add_argument( + "-x", + "--xformers", + action="store_true", + help="Apply xformers' memory-efficient optimizations", + ) args = parser.parse_args() run( port=args.port, - max_workers=args.max_workers, install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, - api_key=args.api_key, + skip_compile=args.skip_compile, + environs={ + "LLAMA_API_MAX_WORKERS": str(args.max_workers), + "LLAMA_API_XFORMERS": "1" if 
args.xformers else "0", + "LLAMA_API_API_KEY": args.api_key or "", + }, ) From cfc18bf56e443e875dc629cc1df13d0e64bdc87d Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 14:41:46 +0900 Subject: [PATCH 03/15] Test suite refactor --- pyproject.toml | 2 +- requirements.txt | 22 ++-- tests/conftest.py | 176 +++++++++++++++++++++++++- tests/test_server.py | 289 ++++++++++--------------------------------- 4 files changed, 249 insertions(+), 240 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a88a9f5..2d80eb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ python = ">=3.8.1,<3.12" poetry = "^1.5.1" uvicorn = { extras = ["standard"], version = "^0.23" } -fastapi = "^0.100" +fastapi = "^0.100.1" orjson = "^3.9" sse-starlette = "^1.6" psutil = "^5.9" diff --git a/requirements.txt b/requirements.txt index 75b8ca9..ac45196 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (s charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" -cmake==3.27.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +cmake==3.27.1 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" cryptography==41.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" @@ -25,19 +25,19 @@ httptools==0.6.0 ; python_full_version >= "3.8.1" and python_version < "3.12" huggingface-hub==0.16.4 ; python_full_version >= "3.8.1" and python_version < "3.12" idna==3.4 ; python_full_version >= "3.8.1" and python_version < "3.12" importlib-metadata==6.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -importlib-resources==6.0.0 ; python_full_version >= "3.8.1" and python_version < "3.9" +importlib-resources==6.0.1 ; python_full_version >= "3.8.1" and python_version < "3.9" installer==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" jaraco-classes==3.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" jeepney==0.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" jsonschema-specifications==2023.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -jsonschema==4.18.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +jsonschema==4.19.0 ; python_full_version >= "3.8.1" and python_version < "3.12" keyring==23.13.1 ; python_full_version >= "3.8.1" and python_version < "3.12" lockfile==0.12.2 ; python_full_version >= "3.8.1" and python_version < "3.12" -more-itertools==10.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +more-itertools==10.1.0 ; python_full_version >= "3.8.1" and python_version < "3.12" msgpack==1.0.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ninja==1.11.1 ; python_full_version >= "3.8.1" and python_version < "3.12" numpy==1.24.4 ; python_full_version >= "3.8.1" and python_version < "3.12" -orjson==3.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +orjson==3.9.4 ; python_full_version >= "3.8.1" and python_version < "3.12" packaging==23.1 ; python_full_version >= "3.8.1" and python_version < "3.12" 
pexpect==4.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pkginfo==1.9.6 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -46,7 +46,7 @@ platformdirs==3.10.0 ; python_full_version >= "3.8.1" and python_version < "3.12 poetry-core==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry-plugin-export==1.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry==1.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -protobuf==4.23.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +protobuf==4.24.0 ; python_full_version >= "3.8.1" and python_version < "3.12" psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") @@ -58,12 +58,12 @@ python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12 pywin32-ctypes==0.2.2 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "win32" pyyaml==6.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" rapidfuzz==2.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -referencing==0.30.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -regex==2023.6.3 ; python_full_version >= "3.8.1" and python_version < "3.12" +referencing==0.30.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +regex==2023.8.8 ; python_full_version >= "3.8.1" and python_version < "3.12" requests-toolbelt==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" requests==2.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" rpds-py==0.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" -safetensors==0.3.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +safetensors==0.3.2 ; python_full_version >= "3.8.1" and python_version < "3.12" secretstorage==3.3.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.12" shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -75,9 +75,9 @@ tensorflow-hub==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3. 
tokenizers==0.13.3 ; python_full_version >= "3.8.1" and python_version < "3.12" tomli==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.11" tomlkit==0.12.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -tqdm==4.65.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tqdm==4.66.0 ; python_full_version >= "3.8.1" and python_version < "3.12" transformers==4.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -trove-classifiers==2023.7.6 ; python_full_version >= "3.8.1" and python_version < "3.12" +trove-classifiers==2023.8.7 ; python_full_version >= "3.8.1" and python_version < "3.12" typing-extensions==4.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" urllib3==1.26.16 ; python_full_version >= "3.8.1" and python_version < "3.12" uvicorn[standard]==0.23.2 ; python_full_version >= "3.8.1" and python_version < "3.12" diff --git a/tests/conftest.py b/tests/conftest.py index e276030..e96096e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,21 +1,49 @@ +from asyncio import gather +from datetime import datetime import importlib import unittest from os import environ from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Type # noqa: F401 +from re import compile, sub +from typing import ( + TYPE_CHECKING, + AsyncIterator, + Dict, + Iterable, + List, + Literal, + Optional, + Tuple, + Union, +) + +from orjson import loads +from llama_api.schemas.api import ( + ChatCompletionChoice, + ChatCompletionChunk, + CompletionChoice, + CompletionChunk, + ModelList, +) from llama_api.server.app_settings import create_app_llama_cpp from llama_api.shared.config import Config -from llama_api.utils.dependency import install_package, is_package_available from llama_api.utils.concurrency import _pool +from llama_api.utils.dependency import install_package, is_package_available +from llama_api.utils.system import get_cuda_version if TYPE_CHECKING: + from typing import Type # noqa: F401 + from fastapi.testclient import TestClient # noqa: F401 - from httpx import AsyncClient # noqa: F401 + from httpx import AsyncClient, Response # noqa: F401 + + +EndPoint = Literal["completions", "chat/completions"] class TestLlamaAPI(unittest.TestCase): - ggml_model: str = "orca-mini-3b.ggmlv3.q4_1.bin" + ggml_model: str = "orca-mini-3b.ggmlv3.q4_0.bin" ggml_path: Path = Config.project_root / Path(f"models/ggml/{ggml_model}") gptq_model: str = "orca_mini_7b" @@ -37,9 +65,147 @@ def setUpClass(cls): "fastapi.testclient" ).TestClient # type: Type[TestClient] cls.app = create_app_llama_cpp() - environ["MAX_WORKERS"] = "2" + environ["LLAMA_API_MAX_WORKERS"] = "2" @classmethod def tearDownClass(cls): if _pool is not None: _pool.shutdown(wait=True) + + @property + def check_ggml(self) -> None: + if not self.ggml_path.exists(): + self.skipTest(f"No model in {self.ggml_path}") + + @property + def check_gptq(self) -> None: + if not self.gptq_path.exists(): + self.skipTest(f"No model in {self.gptq_path}") + + @property + def check_cuda(self) -> None: + if not get_cuda_version(): + self.skipTest("CUDA is not available") + + async def arequest_completion( + self, + model_names: Union[List[str], Tuple[str, ...]], + endpoints: Union[EndPoint, Iterable[EndPoint]], + ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: + async with self.AsyncClient( + app=self.app, base_url="http://localhost", timeout=None + ) as client: + # Get models using the API + models = await self.get_models( + client=client, model_names=list(model_names) + ) # type: 
List[str] + + # Submit requests to the API and get responses + return await self.submit_streaming_requests( + client=client, + model_and_endpoints=zip( + models, + ( + [endpoints] * len(model_names) # type: ignore + if isinstance(endpoints, str) + else endpoints + ), + ), + ) + + async def get_models( + self, client: "AsyncClient", model_names: List[str] + ) -> List[str]: + # Get models using the API + model_resp: ModelList = (await client.get("/v1/models")).json() + models: List[str] = [] + for model_name in model_names: + model: Optional[str] = None + for model_data in model_resp["data"]: + if model_name in model_data["id"]: + model = sub(r"\(.*\)", "", model_data["id"]).strip() + break + self.assertTrue(model, f"Model {model_name} not found") + models.append(str(model)) + return models + + async def submit_streaming_requests( + self, + client: "AsyncClient", + model_and_endpoints: Iterable[Tuple[str, EndPoint]], + ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: + async def send_request( + model: str, endpoint: EndPoint + ) -> Tuple[List[str], datetime, datetime]: + async with client.stream( + method="POST", + url=f"/v1/{endpoint}", + json=self.union( + {"model": model, "max_tokens": 50}, + {"stream": True}, + {"messages": self.messages} + if endpoint.startswith("chat") + else {"prompt": self.prompt}, + ), + headers={"Content-Type": "application/json"}, + ) as response: + response.raise_for_status() + start_at = datetime.now() + results = [] # type: List[str] + async for chunk in self.extract_json_from_streaming_response( + response + ): + self.assertIn("choices", chunk, "No choices in response") + choice = chunk["choices"][0] + if "delta" in choice and choice["delta"].get("content"): + results.append(choice["delta"]["content"]) + elif "text" in choice: + results.append(choice["text"]) + self.assertGreaterEqual(len(results), 1, "No result in response") + return results, start_at, datetime.now() + + tasks = [ + send_request(model, endpoint) + for model, endpoint in model_and_endpoints + ] + return tuple(zip(*await gather(*tasks))) # type: ignore + + def harvest_results( + self, models: List[str], responses: List["Response"] + ) -> List[str]: + results: List[str] = [] + for model, response in zip(models, responses): + self.assertEqual(response.status_code, 200) + choice: Union[ + CompletionChoice, ChatCompletionChoice + ] = response.json()["choices"][0] + if "message" in choice: + results.append(choice["message"]["content"]) + elif "text" in choice: + results.append(choice["text"]) + else: + raise ValueError(f"Unknown response: {response.json()}") + print(f"Result of {model}:", results[-1], end="\n\n", flush=True) + self.assertEqual(len(results), len(models)) + return results + + async def extract_json_from_streaming_response( + self, + response: "Response", + ) -> AsyncIterator[Union[CompletionChunk, ChatCompletionChunk]]: + """Extract json from streaming `httpx.Response`""" + regex_finder = compile(rb"data:\s*({.+?})\s*\r?\n\s*\r?\n").finditer + bytes_buffer = bytearray() + async for stream in response.aiter_bytes(): + bytes_buffer.extend(stream) + for match in regex_finder(bytes_buffer): + try: + json_data = loads(match.group(1)) + yield json_data + bytes_buffer.clear() + except Exception: + continue + + @staticmethod + def union(*dicts: Dict) -> Dict: + return {k: v for d in dicts for k, v in d.items()} diff --git a/tests/test_server.py b/tests/test_server.py index 8255b4a..f44c132 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -1,32 +1,11 @@ -import re 
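To make the framing handled by extract_json_from_streaming_response above concrete: each server-sent event arrives as a "data: {...}" line terminated by a blank line, and the compiled pattern pulls the JSON object out of every complete frame in the byte buffer. The payloads below are made-up samples; only the framing matters.

    from re import compile
    from orjson import loads

    finder = compile(rb"data:\s*({.+?})\s*\r?\n\s*\r?\n").finditer
    buffer = bytearray(
        b'data: {"choices": [{"delta": {"content": "Hello"}}]}\r\n\r\n'
        b'data: {"choices": [{"text": " world"}]}\r\n\r\n'
    )
    for match in finder(buffer):
        print(loads(match.group(1)))  # one parsed chunk per complete SSE frame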
-from asyncio import gather -from typing import ( - Awaitable, - Dict, - Iterable, - List, - Literal, - Optional, - Tuple, - Union, -) +# flake8: noqa import unittest -from llama_api.utils.system import get_cuda_version -from tests.conftest import TestLlamaAPI - -from llama_api.schemas.api import ( - ModelList, - ChatCompletionChoice, - CompletionChoice, -) - -EndPoint = Literal["completions", "chat/completions"] +from tests.conftest import TestLlamaAPI -class TestServer(TestLlamaAPI, unittest.IsolatedAsyncioTestCase): - """Test the FastAPI server - with basic health checks & LLM completion requests (with concurrency)""" +class TestServerBasic(TestLlamaAPI): + """Test the FastAPI server with basic health checks""" def test_health(self): """Test the health endpoint""" @@ -37,239 +16,103 @@ def test_health(self): ) self.assertEqual(response.status_code, 200) + def test_v1_models(self): + """Test the v1/models endpoint""" + with self.TestClient(app=self.app) as client: + response = client.get( + "/v1/models", + headers={"Content-Type": "application/json"}, + ) + self.assertEqual(response.status_code, 200) + def test_import_llama_cpp(self): try: - from llama_api.modules.llama_cpp import ( # noqa: F401 - LlamaCppCompletionGenerator, + from llama_api.modules.llama_cpp import ( + LlamaCppCompletionGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") - @unittest.skipIf( - get_cuda_version() is None, - reason="No CUDA found on this system", - ) def test_import_exllama(self): + self.check_cuda try: - from llama_api.modules.exllama import ( # noqa: F401 - ExllamaCompletionGenerator, + from llama_api.modules.exllama import ( + ExllamaCompletionGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") def test_import_sentence_encoder(self): try: - from llama_api.modules.sentence_encoder import ( # noqa: F401 - SentenceEncoderEmbeddingGenerator, + from llama_api.modules.sentence_encoder import ( + SentenceEncoderEmbeddingGenerator, # noqa: F401 ) except ImportError as e: self.fail(f"Failed to import module: {e}") def test_import_transformer(self): try: - from llama_api.modules.transformer import ( # noqa: F401 - TransformerEmbeddingGenerator, - ) + from llama_api.modules.transformer import ( + TransformerEmbeddingGenerator, # noqa: F401 + ) # except ImportError as e: self.fail(f"Failed to import module: {e}") - def test_v1_models(self): - """Test the v1/models endpoint""" - with self.TestClient(app=self.app) as client: - response = client.get( - "/v1/models", - headers={"Content-Type": "application/json"}, - ) - self.assertEqual(response.status_code, 200) - @unittest.skipIf( - not TestLlamaAPI.ggml_path.exists(), - reason=f"No model in {TestLlamaAPI.ggml_path}", - ) - def test_llama_cpp(self): +class TestServerAdvanced(TestLlamaAPI, unittest.IsolatedAsyncioTestCase): + """Test the FastAPI server with advanced completion tests""" + + async def test_llama_cpp(self): """Test the Llama CPP model completion endpoints""" - self._request_completion( - model_names=(self.ggml_model,), endpoints="completions" + self.check_ggml + model_names = (self.ggml_model, self.ggml_model) + responses, starts, ends = await self.arequest_completion( + model_names=model_names, + endpoints=("chat/completions", "completions"), ) - self._request_completion( - model_names=(self.ggml_model,), endpoints="chat/completions" + start_1, end_1 = starts[0], ends[0] + print(f"GGML response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] 
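The assertions that follow boil down to checking whether the two generation windows are disjoint: two requests for the same model are expected to share a worker and therefore run back-to-back, while the mixed GGML/GPTQ test further down expects the windows to overlap. As a plain predicate (illustrative helper, not part of the test suite):

    from datetime import datetime

    def ran_sequentially(start_1: datetime, end_1: datetime,
                         start_2: datetime, end_2: datetime) -> bool:
        # Disjoint wall-clock intervals: one stream finished before the
        # other one started producing tokens.
        return end_1 < start_2 or end_2 < start_1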
+ print(f"GGML response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + end_1 < start_2 or end_2 < start_1, + f"Synchronous completion failed: {end_1} < {start_2} and {end_2} < {start_1}", ) - @unittest.skipIf( - not TestLlamaAPI.gptq_path.exists(), - reason=f"No model in{TestLlamaAPI.gptq_path}", - ) - def test_exllama(self): + async def test_exllama(self): """Test the ExLLama model completion endpoints""" - self._request_completion( - model_names=(self.gptq_model,), endpoints="completions" - ) - self._request_completion( - model_names=(self.gptq_model,), endpoints="chat/completions" - ) - - @unittest.skipIf( - not TestLlamaAPI.ggml_path.exists(), - reason=f"No model in {TestLlamaAPI.ggml_path}", - ) - async def test_llama_cpp_concurrency(self): - """Test the Llama CPP model completion endpoints with concurrency""" - model_names: Tuple[str, ...] = (self.ggml_model, self.ggml_model) - await self._arequest_completion( - model_names=model_names, endpoints="completions" + self.check_gptq + model_names = (self.gptq_model, self.gptq_model) + responses, starts, ends = await self.arequest_completion( + model_names=model_names, + endpoints=("chat/completions", "completions"), ) - - @unittest.skipIf( - not TestLlamaAPI.gptq_path.exists(), - reason=f"No model in {TestLlamaAPI.gptq_path}", - ) - async def test_exllama_concurrency(self): - """Test the ExLLama model completion endpoints with concurrency""" - model_names: Tuple[str, ...] = (self.gptq_model, self.gptq_model) - await self._arequest_completion( - model_names=model_names, endpoints="completions" + start_1, end_1 = starts[0], ends[0] + print(f"GPTQ response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] + print(f"GPTQ response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + end_1 < start_2 or end_2 < start_1, + f"Synchronous completion failed: {end_1} < {start_2} and {end_2} < {start_1}", ) - @unittest.skipIf( - (not TestLlamaAPI.ggml_path.exists()) - or (not TestLlamaAPI.gptq_path.exists()), - f"No model in {TestLlamaAPI.ggml_path} or {TestLlamaAPI.gptq_path}", - ) async def test_llama_mixed_concurrency(self): """Test the Llama CPP & ExLLama model completion endpoints with concurrency""" - model_names: Tuple[str, ...] 
= (self.ggml_model, self.gptq_model) - await self._arequest_completion( + self.check_ggml + self.check_gptq + model_names = (self.ggml_model, self.gptq_model) + responses, starts, ends = await self.arequest_completion( model_names=model_names, endpoints="completions" ) - - async def _arequest_completion( - self, - model_names: Union[List[str], Tuple[str, ...]], - endpoints: Union[EndPoint, Iterable[EndPoint]], - ): - _endpoints: Iterable[str] = ( - [endpoints] * len(model_names) - if isinstance(endpoints, str) - else endpoints - ) - async with self.AsyncClient( - app=self.app, base_url="http://localhost", timeout=None - ) as client: - # Get models using the API - model_resp: ModelList = (await client.get("/v1/models")).json() - models: List[str] = [] - for model_name in model_names: - model: Optional[str] = None - for model_data in model_resp["data"]: - if model_name in model_data["id"]: - model = re.sub(r"\(.*\)", "", model_data["id"]).strip() - break - self.assertTrue(model, f"Model {model_name} not found") - models.append(str(model)) - - # Submit requests to the API - tasks: List[Awaitable] = [] - for model, endpoint in zip(models, _endpoints): - request = {"model": model, "max_tokens": 50} - request_message = ( - {"messages": self.messages} - if endpoint.startswith("chat") - else {"prompt": self.prompt} - ) - tasks.append( - client.post( - f"/v1/{endpoint}", - json=_union( - request, {"stream": False}, request_message - ), - headers={"Content-Type": "application/json"}, - timeout=None, - ) - ) - - # Wait for responses - cmpl_resps: List = await gather(*tasks) - results: List[str] = [] - for model, cmpl_resp in zip(models, cmpl_resps): - self.assertEqual(cmpl_resp.status_code, 200) - choice: Union[ - CompletionChoice, ChatCompletionChoice - ] = cmpl_resp.json()["choices"][0] - if "message" in choice: - results.append(choice["message"]["content"]) - elif "text" in choice: - results.append(choice["text"]) - else: - raise ValueError(f"Unknown response: {cmpl_resp.json()}") - print( - f"Result of {model}:", results[-1], end="\n\n", flush=True - ) - - self.assertEqual(len(results), len(models)) - - def _request_completion( - self, - model_names: Union[List[str], Tuple[str, ...]], - endpoints: Union[EndPoint, Iterable[EndPoint]], - ): - _endpoints: Iterable[str] = ( - [endpoints] * len(model_names) - if isinstance(endpoints, str) - else endpoints + start_1, end_1 = starts[0], ends[0] + print(f"GGML response: {''.join(responses[0])}", flush=True) + start_2, end_2 = starts[1], ends[1] + print(f"GPTQ response: {''.join(responses[1])}", flush=True) + + self.assertTrue( + start_2 < end_1 or start_1 < end_2, + f"Asynchronous completion failed: {start_1} < {end_2} and {start_2} < {end_1}", ) - with self.TestClient(app=self.app) as client: - # Get models using the API - model_resp = (client.get("/v1/models")).json() - models: List[str] = [] - for model_name in model_names: - model: Optional[str] = None - for model_data in model_resp["data"]: - if model_name in model_data["id"]: - model = re.sub(r"\(.*\)", "", model_data["id"]).strip() - break - self.assertTrue(model, f"Model {model_name} not found") - models.append(str(model)) - - # Submit requests to the API - cmpl_resps: List = [] - for model, endpoint in zip(models, _endpoints): - request = {"model": model, "max_tokens": 50} - request_message = ( - {"messages": self.messages} - if endpoint.startswith("chat") - else {"prompt": self.prompt} - ) - cmpl_resps.append( - client.post( - f"/v1/{endpoint}", - json=_union( - request, {"stream": False}, 
request_message - ), - headers={"Content-Type": "application/json"}, - timeout=None, - ) - ) - - # Wait for responses - results: List[str] = [] - for model, cmpl_resp in zip(models, cmpl_resps): - self.assertEqual(cmpl_resp.status_code, 200) - choice: Union[ - CompletionChoice, ChatCompletionChoice - ] = cmpl_resp.json()["choices"][0] - if "message" in choice: - results.append(choice["message"]["content"]) - elif "text" in choice: - results.append(choice["text"]) - else: - raise ValueError(f"Unknown response: {cmpl_resp.json()}") - print( - f"Result of {model}:", results[-1], end="\n\n", flush=True - ) - - self.assertEqual(len(results), len(models)) - - -def _union(*dicts: Dict) -> Dict: - return {k: v for d in dicts for k, v in d.items()} From 72d21f43d484053435f7f66d66f4db1dec9ca45b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 15:48:00 +0900 Subject: [PATCH 04/15] Added persistent docker compose file --- docker-compose.persistent.yml | 47 +++++++++++++++++++++++++++++++++++ docker-compose.yml | 5 ++-- 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 docker-compose.persistent.yml diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml new file mode 100644 index 0000000..4b0d767 --- /dev/null +++ b/docker-compose.persistent.yml @@ -0,0 +1,47 @@ +version: '3.8' + +volumes: + llama-api-models: + +services: + llama-api: + image: cosogi/llama-api:latest + entrypoint: ["python3", "-m", "main", "--port", "8000"] + environment: + - LLAMA_API_MAX_WORKERS=1 + - LLAMA_API_API_KEY= + volumes: + - llama-api-models:/app/models + - ./model_definitions.py:/app/model_definitions.py + - ./main.py:/app/main.py + ports: + - 8000:8000 + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] + + +# services: +# llama-api: +# build: +# context: . 
+# dockerfile: Dockerfile +# entrypoint: ["python3", "-m", "main", "--port", "8000"] +# environment: +# - LLAMA_API_MAX_WORKERS=1 +# - LLAMA_API_API_KEY= +# volumes: +# - llama-api-models:/app/models +# - ./model_definitions.py:/app/model_definitions.py +# - ./main.py:/app/main.py +# ports: +# - 8000:8000 +# deploy: +# resources: +# reservations: +# devices: +# - driver: nvidia +# capabilities: [gpu] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c87a432..d96c5d0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,10 +2,11 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230730 + image: cosogi/llama-api:latest entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - - MAX_WORKERS=1 + - LLAMA_API_MAX_WORKERS=1 + - LLAMA_API_API_KEY= volumes: - ./models:/app/models - ./llama_api:/app/llama_api From 05f6108d8be177f32030fe91443eaa76f82e14f9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 13 Aug 2023 17:08:05 +0900 Subject: [PATCH 05/15] Support caching model path --- .gitignore | 1 + llama_api/schemas/models.py | 7 ++-- llama_api/utils/path.py | 73 +++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2b48ff4..4836eb0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ repositories/ .venv/ .vscode/ .test-venv/ +.temp/ PRIVATE_* \ No newline at end of file diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index 9e4027d..e622438 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -2,9 +2,8 @@ from functools import cached_property from typing import List, Literal, Optional -from llama_api.utils.path import resolve_model_path_to_posix - from ..modules.base import BaseLLMModel +from ..utils.path import path_resolver @dataclass @@ -96,7 +95,7 @@ class LlamaCppModel(BaseLLMModel): @cached_property def model_path_resolved(self): - return resolve_model_path_to_posix( + return path_resolver( self.model_path, default_relative_directory="models/ggml", ) @@ -153,7 +152,7 @@ class ExllamaModel(BaseLLMModel): @cached_property def model_path_resolved(self): - return resolve_model_path_to_posix( + return path_resolver( self.model_path, default_relative_directory="models/gptq", ) diff --git a/llama_api/utils/path.py b/llama_api/utils/path.py index 880e99e..84f6f9d 100644 --- a/llama_api/utils/path.py +++ b/llama_api/utils/path.py @@ -1,7 +1,9 @@ +import orjson from pathlib import Path from re import compile from typing import List, Literal, Optional + from ..shared.config import Config from ..utils.huggingface_downloader import ( Classification, @@ -9,6 +11,7 @@ ) from ..utils.logger import ApiLogger + logger = ApiLogger(__name__) @@ -181,8 +184,78 @@ def resolve_model_path_to_posix( logger.info(f"`{path.name}` found in {parent_dir}") return (parent_dir / model_path).resolve().as_posix() + if model_path.count("/") != 1: + raise FileNotFoundError( + f"`{model_path}` not found in any of the following " + f"directories: {parent_dir_candidates}" + ) # Try to resolve the model path from Huggingface return HuggingfaceResolver(model_path).resolve() except Exception as e: logger.error(f"Error resolving model path: {e}") raise e + + +def resolve_model_path_to_posix_with_cache( + model_path: str, + default_relative_directory: Optional[str] = None, +) -> str: + """Resolve a model path to a POSIX path, with caching.""" + from filelock import FileLock, Timeout + + cache_file = Path(".temp/model_paths.json") + 
cache_file.parent.mkdir(parents=True, exist_ok=True) + try: + with FileLock( + cache_file.with_suffix(".lock"), timeout=10 + ): # Set a timeout if necessary + # Read the cache + try: + with open(cache_file, "r") as f: + cache = orjson.loads(f.read()) + assert isinstance(cache, dict) + except Exception: + cache = {} + + resolved = cache.get(model_path) + if not (isinstance(resolved, str) or resolved is None): + raise TypeError( + f"Invalid cache entry for model path `{model_path}`: " + f"{resolved}" + ) + if not resolved: + resolved = resolve_model_path_to_posix( + model_path, default_relative_directory + ) + cache[model_path] = resolved + + # Update the cache file + try: + with open(cache_file, "w") as f: + f.write(orjson.dumps(cache).decode()) + except Exception as e: + logger.error(f"Error writing model path cache: {e}") + return resolved + except (Timeout, TypeError) as e: + logger.warning( + "Error acquiring lock for model path cache" + + str(cache_file.with_suffix(".lock")) + + f": {e}" + ) + return resolve_model_path_to_posix( + model_path, default_relative_directory + ) + + +def path_resolver( + model_path: str, default_relative_directory: Optional[str] = None +) -> str: + """Resolve a model path to a POSIX path, with caching if possible.""" + try: + return resolve_model_path_to_posix_with_cache( + model_path, default_relative_directory + ) + except ImportError: + return resolve_model_path_to_posix( + model_path, default_relative_directory + ) From 681bfae8d31b584a4ad7b3e73219fab514e9a41c Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 01:41:11 +0900 Subject: [PATCH 06/15] Fixed CUDA docker image build error --- Dockerfile | 14 +++++----- docker-compose.persistent.yml | 3 ++- docker-compose.yml | 44 ++++++++++++++++---------------- llama_api/modules/llama_cpp.py | 7 ++--- llama_api/server/app_settings.py | 20 +++------------ llama_api/shared/config.py | 6 +---- llama_api/utils/dependency.py | 14 +++++++--- llama_api/utils/llama_cpp.py | 37 ++++++++++++++++----------- main.py | 3 ++- 9 files changed, 74 insertions(+), 74 deletions(-) diff --git a/Dockerfile b/Dockerfile index 85ba958..7757736 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,17 +2,16 @@ ### Approximately 5 ~ 10 minutes to build # Select the required CUDA version. -ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM nvidia/cuda:${CUDA_IMAGE} +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 ENV PYTHON_VERSION="3.11.4" ENV PYTHON_VERSION_SHORT="3.11" -ENV HOST 0.0.0.0 -ENV PORT=8000 # Copy the necessary files. -COPY requirements.txt /app/requirements.txt -COPY pyproject.toml /app/pyproject.toml COPY llama_api /app/llama_api +COPY pyproject.toml /app/pyproject.toml +COPY requirements.txt /app/requirements.txt +COPY main.py /app/main.py +COPY model_definitions.py /app/model_definitions.py # Install the necessary applications, and then install Python. # Then, install the necessary Python packages(Dependencies). @@ -41,7 +40,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && apt-get clean \ && rm -rf /tmp/* \ && cd /app \ - && python3 -m llama_api.server.app_settings --force-cuda --install-pkgs + && python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda + # Need to skip complie because GPU access to host is not supported when building image. # Set the working directory and start the server. 
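As a usage note for the path helpers introduced in patch 05 above: path_resolver() goes through the FileLock-guarded JSON cache when the filelock package is importable and otherwise falls back to a plain resolve, so callers only ever see a POSIX path string. A minimal sketch (the model file name is just an assumed example):

    from llama_api.utils.path import path_resolver

    # Looks under ./models/ggml first, falls back to the Huggingface resolver,
    # and memoizes the result in .temp/model_paths.json when filelock is available.
    resolved = path_resolver(
        "orca-mini-3b.ggmlv3.q4_0.bin",
        default_relative_directory="models/ggml",
    )
    print(resolved)  # a POSIX path string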
WORKDIR /app diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 4b0d767..971d8de 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,9 +5,10 @@ volumes: services: llama-api: - image: cosogi/llama-api:latest + image: cosogi/llama-api:230813 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: + - FORCE_CUDA=1 - LLAMA_API_MAX_WORKERS=1 - LLAMA_API_API_KEY= volumes: diff --git a/docker-compose.yml b/docker-compose.yml index d96c5d0..0ad51af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3' services: llama-api: - image: cosogi/llama-api:latest + image: cosogi/llama-api:230813 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - LLAMA_API_MAX_WORKERS=1 @@ -24,24 +24,24 @@ services: capabilities: [gpu] # services: - # llama-api: - # build: - # context: . - # dockerfile: Dockerfile - # entrypoint: ["python3", "-m", "main", "--port", "8000"] - # environment: - # - MAX_WORKERS=1 - # volumes: - # - ./models:/app/models - # - ./llama_api:/app/llama_api - # - ./model_definitions.py:/app/model_definitions.py - # - ./main.py:/app/main.py - # - ./requirements.txt:/app/requirements.txt - # ports: - # - 8000:8000 - # deploy: - # resources: - # reservations: - # devices: - # - driver: nvidia - # capabilities: [gpu] \ No newline at end of file +# llama-api: +# build: +# context: . +# dockerfile: Dockerfile +# entrypoint: ["python3", "-m", "main", "--port", "8000"] +# environment: +# - MAX_WORKERS=1 +# volumes: +# - ./models:/app/models +# - ./llama_api:/app/llama_api +# - ./model_definitions.py:/app/model_definitions.py +# - ./main.py:/app/main.py +# - ./requirements.txt:/app/requirements.txt +# ports: +# - 8000:8000 +# deploy: +# resources: +# reservations: +# devices: +# - driver: nvidia +# capabilities: [gpu] \ No newline at end of file diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 10501b9..7de518d 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,8 +1,6 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import ( # noqa: F401 - TYPE_CHECKING, - Callable, +from typing import ( Iterator, List, Optional, @@ -23,14 +21,17 @@ convert_text_completion_to_chat, ) from ..utils.dependency import import_repository +from ..utils.llama_cpp import build_shared_lib from ..utils.logger import ApiLogger from .base import BaseCompletionGenerator logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") +build_shared_lib(logger=logger) with import_repository( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", + options=["--recurse-submodules"], ): from repositories.llama_cpp import llama_cpp diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 9c30a7f..27abd75 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -3,13 +3,11 @@ from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path -from typing import Dict, Literal, Optional, Union +from typing import Dict, Literal, Optional -from ..shared.config import Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, - git_clone, install_all_dependencies, install_package, install_pytorch, @@ -67,7 +65,6 @@ def set_priority( def initialize_before_launch( - git_and_disk_paths: Optional[Dict[str, Union[str, Path]]] = None, 
install_packages: bool = False, force_cuda: bool = False, skip_pytorch_install: bool = False, @@ -75,14 +72,11 @@ def initialize_before_launch( skip_compile: bool = False, ) -> None: """Initialize the app""" - - # Git clone the repositories - if git_and_disk_paths is not None: - for git_path, disk_path in git_and_disk_paths.items(): - git_clone(git_path=git_path, disk_path=disk_path) - if install_packages: # Install all dependencies + if not skip_compile: + # Build the shared library of LLaMA C++ code + build_shared_lib(logger=logger, force_cuda=force_cuda) poetry = get_poetry_executable() if not poetry.exists(): # Install poetry @@ -99,10 +93,6 @@ def initialize_before_launch( project_paths = [Path(".")] + list(Path("repositories").glob("*")) install_all_dependencies(project_paths=project_paths) - if not skip_compile: - # Build the shared library of LLaMA C++ code - build_shared_lib(logger=logger) - # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") if environ.get("LLAMA_API_XFORMERS") == "1": @@ -159,7 +149,6 @@ def run( environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( - git_and_disk_paths=Config.git_and_disk_paths, install_packages=install_packages, force_cuda=force_cuda, skip_pytorch_install=skip_pytorch_install, @@ -217,7 +206,6 @@ def run( args = parser.parse_args() initialize_before_launch( - git_and_disk_paths=Config.git_and_disk_paths, install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 45c7d47..4ecd592 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import List, Tuple class Config: @@ -11,10 +11,6 @@ class Config: torch_version: str = "==2.0.1" torch_source: str = "https://download.pytorch.org/whl/torch_stable.html" tensorflow_version: str = "==2.13.0" - git_and_disk_paths: Dict[str, Union[Path, str]] = { - "https://github.com/abetlen/llama-cpp-python": "repositories/llama_cpp", # noqa: E501 - "https://github.com/turboderp/exllama": "repositories/exllama", - } ggml_quanitzation_preferences_order: List[str] = [ "q4_K_M", "q4_K_S", diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 8dbee87..9fc4dd4 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -60,11 +60,15 @@ def is_package_available(package: str) -> bool: return True if find_spec(package) else False -def git_clone(git_path: str, disk_path: Union[Path, str]) -> Optional[bool]: +def git_clone( + git_path: str, + disk_path: Union[Path, str], + options: Optional[List[str]] = None, +) -> Optional[bool]: """Clone a git repository to a disk path.""" if not Path(disk_path).exists(): return run_command( - ["git", "clone", git_path, str(disk_path)], + ["git", "clone", git_path, str(disk_path), *(options or [])], action="clone", name=f"{git_path} to {disk_path}", try_emoji="📥", @@ -203,14 +207,16 @@ def convert_toml_to_requirements_with_poetry( @contextmanager -def import_repository(git_path: str, disk_path: str): +def import_repository( + git_path: str, disk_path: str, options: Optional[List[str]] = None +): """ Import a repository from git. The repository will be cloned to disk_path. The dependencies will be installed from pyproject.toml or requirements.txt. 
""" # Clone the repository - git_clone(git_path=git_path, disk_path=disk_path) + git_clone(git_path=git_path, disk_path=disk_path, options=options) # Add the repository to the path so that it can be imported sys.path.insert(0, str(disk_path)) diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index 75eb0c2..fc16ef2 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -13,10 +13,15 @@ # You can set the CMAKE_ARGS environment variable to change the cmake args. # cuBLAS is default to ON if CUDA is installed. # CPU inference is default if CUDA is not installed. -if get_cuda_version() is None: - CMAKE_ARGS: str = "-DBUILD_SHARED_LIBS=ON" +METAL_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_METAL=ON" +CUBLAS_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_CUBLAS=ON" +CPU_ARGS = "-DBUILD_SHARED_LIBS=ON" +if sys.platform == "darwin": + CMAKE_ARGS: str = METAL_ARGS +elif get_cuda_version() is None: + CMAKE_ARGS: str = CPU_ARGS else: - CMAKE_ARGS = "-DBUILD_SHARED_LIBS=ON -DLLAMA_CUBLAS=ON" + CMAKE_ARGS: str = CUBLAS_ARGS LIB_BASE_NAME: str = "llama" REPOSITORY_FOLDER: str = "repositories" @@ -60,7 +65,7 @@ def _temporary_change_cwd(path): chdir(prev_cwd) -def _git_clone() -> None: +def _git_clone_if_not_exists() -> None: # Clone the git repos if they don't exist for clone_path, clone_command in GIT_CLONES.items(): if not clone_path.exists() or not any(clone_path.iterdir()): @@ -131,6 +136,10 @@ def _cmake_args_to_make_args(cmake_args: List[str]) -> List[str]: # capitalize all letters cmake_arg = cmake_arg.upper() + # skip the `BUILD_SHARED_LIBS` flag + if "BUILD_SHARED_LIBS" in cmake_arg: + continue + # replace `ON` with `1` and `OFF` with `0` cmake_arg = cmake_arg.replace("=ON", "=1").replace("=OFF", "=0") @@ -147,15 +156,9 @@ def _make(make_dir: Path, make_args: List[str], target_dir: Path) -> None: # Run make to build the shared lib # Build the shared lib - run_command( - ["make", *make_args], - action="build", - name="llama.cpp shared lib", - cwd=make_dir, - ) for lib in _get_libs(): run_command( - ["make", lib], + ["make", *make_args, lib], action="build", name="llama.cpp shared lib", cwd=make_dir, @@ -199,20 +202,24 @@ def _cmake(cmake_dir: Path, cmake_args: List[str], target_dir: Path) -> None: def build_shared_lib( - logger: Optional[Logger] = None, - force_cmake: bool = bool(environ.get("FORCE_CMAKE", False)), + logger: Optional[Logger] = None, force_cuda: bool = False ) -> None: """Build the shared library for llama.cpp""" + global CMAKE_ARGS + if force_cuda or bool(environ.get("FORCE_CUDA", False)): + assert get_cuda_version() is not None, "CUDA is not available" + CMAKE_ARGS = CUBLAS_ARGS + if logger is None: logger = getLogger(__name__) logger.setLevel("INFO") # Git clone llama-cpp-python and llama.cpp - _git_clone() + _git_clone_if_not_exists() # Build the libs if they don't exist or if `force_cmake` is True - if force_cmake or not any( + if bool(environ.get("FORCE_CMAKE", False)) or not any( lib_path.exists() for lib_path in _get_lib_paths(MODULE_PATH) ): # Build the libs diff --git a/main.py b/main.py index 9b8d7f7..4215d49 100644 --- a/main.py +++ b/main.py @@ -70,7 +70,8 @@ skip_compile=args.skip_compile, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), - "LLAMA_API_XFORMERS": "1" if args.xformers else "0", + "LLAMA_API_XFORMERS": "1" if args.xformers else "", "LLAMA_API_API_KEY": args.api_key or "", + "FORCE_CUDA": "1" if args.force_cuda else "", }, ) From 0775d11677c4de1daccc2465c8994e651bf1b63d Mon Sep 17 00:00:00 2001 From: c0sogi 
Date: Mon, 14 Aug 2023 23:07:12 +0900 Subject: [PATCH 07/15] Added chat logger --- docker-compose.persistent.yml | 2 +- docker-compose.yml | 3 +- llama_api/modules/base.py | 13 ++++- llama_api/modules/exllama.py | 69 +++++++++++++++----------- llama_api/schemas/models.py | 4 +- llama_api/server/routers/v1.py | 89 ++++++++++++++++++++++++++++++++-- llama_api/utils/logger.py | 16 +++--- 7 files changed, 152 insertions(+), 44 deletions(-) diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 971d8de..08605d3 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,7 @@ volumes: services: llama-api: - image: cosogi/llama-api:230813 + image: cosogi/llama-api:230814 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/docker-compose.yml b/docker-compose.yml index 0ad51af..a914dfa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,9 +2,10 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230813 + image: cosogi/llama-api:230814 entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: + - FORCE_CUDA=1 - LLAMA_API_MAX_WORKERS=1 - LLAMA_API_API_KEY= volumes: diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 09b8291..61b1e7e 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import asdict, dataclass +from pathlib import Path from typing import Any, Iterator, List, TypeVar -from llama_api.mixins.logits import LogitsMixin - from ..mixins.interrupt import InterruptMixin +from ..mixins.logits import LogitsMixin from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( APIChatMessage, @@ -29,6 +29,10 @@ class BaseLLMModel: def asdict(self) -> dict: return asdict(self) + @property + def model_path_resolved(self) -> str: + return self.model_path + class BaseCompletionGenerator( ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin @@ -86,6 +90,11 @@ def decode(self, ids: List[int], **kwargs: Any) -> str: def llm_model(self) -> "BaseLLMModel": """The LLM model used by this generator.""" + @property + def model_name(self) -> str: + """Identifier for the model used by this generator.""" + return Path(self.llm_model.model_path_resolved).stem + class BaseEmbeddingGenerator(ABC): @abstractmethod diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 293682d..7d9e960 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,5 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa +from gc import collect from os import environ from ..utils.logger import ApiLogger @@ -28,6 +29,7 @@ ) from torch import IntTensor, Tensor, cuda, version +from torch.cuda import empty_cache from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor @@ -255,7 +257,7 @@ def _generator( logit_processors = ( [ processor - for processor in BaseCompletionGenerator.get_logit_processors( + for processor in cg.get_logit_processors( settings=settings, encoder=cg.encode, ) @@ -332,12 +334,18 @@ def _generate_text_with_streaming( return_mask=True, ) generator.gen_begin(ids, mask=mask) + + prompt_tokens = ids.shape[-1] + context_window = cg.llm_model.max_total_tokens cg.raise_for_token_limit( - prompt_tokens=ids.shape[-1], - context_window=cg.llm_model.max_total_tokens, + prompt_tokens=prompt_tokens, context_window=context_window + ) + settings.max_tokens = min( + 
settings.max_tokens, context_window - prompt_tokens ) + yield from _generator( - cg, cfg_mask=mask, settings=settings, stops=stops + cg, settings=settings, cfg_mask=mask, stops=stops ) except Exception as e: logger.exception(e) @@ -406,8 +414,8 @@ def from_pretrained( def generate_completion_with_streaming( self, prompt: str, settings: "TextGenerationSettings" ) -> Iterator["CompletionChunk"]: - completion_id: str = settings.completion_id - model_path: str = str(self.config.model_path) + completion_id = settings.completion_id + model = self.model_name last_token: Optional[str] = None generated_text: str = "" for token in _generate_text_with_streaming( @@ -417,14 +425,14 @@ def generate_completion_with_streaming( if last_token is not None: yield make_completion_chunk( id=completion_id, - model=model_path, + model=model, text=last_token, finish_reason=None, ) last_token = token yield make_completion_chunk( id=completion_id, - model=model_path, + model=model, text=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( @@ -438,19 +446,19 @@ def generate_completion_with_streaming( def generate_completion( self, prompt: str, settings: "TextGenerationSettings" ) -> "Completion": - completion_id: str = settings.completion_id - generated_text: str = "".join( + completion_id = settings.completion_id + generated_text = "".join( _generate_text_with_streaming( self, prompt=prompt, settings=settings ) ) - n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] - n_completion_tokens: int = self._completion_status.get( + n_prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + n_completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) return make_completion( id=completion_id, - model=str(self.config.model_path), + model=self.model_name, text=generated_text, prompt_tokens=n_prompt_tokens, completion_tokens=n_completion_tokens, @@ -464,9 +472,9 @@ def generate_chat_completion_with_streaming( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> Iterator["ChatCompletionChunk"]: - completion_id: str = settings.completion_id + completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - model_path: str = str(self.config.model_path) + model = self.model_name last_token: Optional[str] = None generated_text: str = "" for token in _generate_text_with_streaming( @@ -476,14 +484,14 @@ def generate_chat_completion_with_streaming( if last_token is not None: yield make_chat_completion_chunk( id=completion_id, - model=model_path, + model=model, content=last_token, finish_reason=None, ) last_token = token yield make_chat_completion_chunk( id=completion_id, - model=model_path, + model=model, content=last_token if last_token is not None else "", finish_reason="length" if self._completion_status.get( @@ -498,20 +506,20 @@ def generate_chat_completion( messages: List["APIChatMessage"], settings: "TextGenerationSettings", ) -> "ChatCompletion": - completion_id: str = settings.completion_id + completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text: str = "".join( + generated_text = "".join( _generate_text_with_streaming( self, prompt=prompt, settings=settings ) ) - prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1] - completion_tokens: int = self._completion_status.get( + prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + completion_tokens = 
self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) return make_chat_completion( id=completion_id, - model=str(self.config.model_path), + model=self.model_name, content=generated_text, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -529,11 +537,6 @@ def decode(self, ids: List[int], **kwargs) -> str: return str(self._tokenizer.decode(IntTensor(ids))) def __del__(self) -> None: - if self._model is not None: - self._model.free_unmanaged() - del self._model - self._model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") if self._tokenizer is not None: getattr(self._tokenizer, "__del__", lambda: None)() del self._tokenizer @@ -544,6 +547,18 @@ def __del__(self) -> None: del self._cache self._cache = None logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + if self._generator is not None: + getattr(self._generator, "__del__", lambda: None)() + del self._generator + self._generator = None + logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + collect() + empty_cache() @overload diff --git a/llama_api/schemas/models.py b/llama_api/schemas/models.py index e622438..2bc1438 100644 --- a/llama_api/schemas/models.py +++ b/llama_api/schemas/models.py @@ -94,7 +94,7 @@ class LlamaCppModel(BaseLLMModel): mul_mat_q: Optional[bool] = None # TEMPORARY @cached_property - def model_path_resolved(self): + def model_path_resolved(self) -> str: return path_resolver( self.model_path, default_relative_directory="models/ggml", @@ -151,7 +151,7 @@ class ExllamaModel(BaseLLMModel): concurrent_streams: bool = False @cached_property - def model_path_resolved(self): + def model_path_resolved(self) -> str: return path_resolver( self.model_path, default_relative_directory="models/gptq", diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 9e8b7f7..2d6792c 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -35,12 +35,14 @@ from anyio.streams.memory import MemoryObjectSendStream from fastapi import APIRouter, Request from fastapi.concurrency import iterate_in_threadpool, run_in_threadpool -from orjson import dumps +from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse from ...schemas.api import ( ChatCompletion, + ChatCompletionChunk, Completion, + CompletionChunk, CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, @@ -53,7 +55,7 @@ run_in_processpool_with_wix, ) from ...utils.errors import RouteErrorHandler -from ...utils.logger import ApiLogger +from ...utils.logger import ApiLogger, LoggingConfig from ..pools.llama import ( generate_completion, generate_completion_chunks, @@ -61,6 +63,14 @@ get_model_names, ) +chat_logger = ApiLogger( + "", + logging_config=LoggingConfig( + console_log_level=100, + file_log_name="./logs/chat.log", + color=False, + ), +) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) T = TypeVar("T") @@ -73,6 +83,7 @@ class TaskStatus(TypedDict): started_at: float interrupted: bool embedding_chunks: Optional[int] + generated_text: str @dataclass @@ -149,6 +160,24 @@ async def get_wix_with_semaphore( raise LookupError("No available wix") +def get_text_from_completion( + completion: Union[Completion, ChatCompletion] +) -> str: + """Get the generated text from a completion""" + if 
"text" in completion["choices"][0]: + return completion["choices"][0]["text"] + return completion["choices"][0]["message"]["content"] + + +def get_text_from_chunk( + chunk: Union[CompletionChunk, ChatCompletionChunk] +) -> str: + """Get the generated text from a completion chunk""" + if "text" in chunk["choices"][0]: + return chunk["choices"][0]["text"] + return chunk["choices"][0]["delta"].get("content", "") + + async def get_event_publisher( request: Request, body: Union[ @@ -158,7 +187,7 @@ async def get_event_publisher( inner_send_chan: MemoryObjectSendStream, task: "Task[None]", interrupt_signal: Event, - iterator: Iterator, + iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], ) -> None: """Publish Server-Sent-Events (SSE) to the client""" with task_manager( @@ -170,6 +199,7 @@ async def get_event_publisher( try: async for chunk in iterate_in_threadpool(iterator): task_status["completion_tokens"] += 1 + task_status["generated_text"] += get_text_from_chunk(chunk) await inner_send_chan.send( b"data: " + dumps(chunk) + b"\n\n" ) @@ -198,6 +228,51 @@ def get_streaming_iterator( yield validate_item_type(gen, type=dict) +def log_request( + body: Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ], + task_status: TaskStatus, +) -> None: + body_without_prompt = body.model_dump( + exclude={"prompt", "messages", "input"}, + exclude_defaults=True, + exclude_unset=True, + exclude_none=True, + ) + if isinstance(body, CreateChatCompletionRequest): + chat_log = { + "request": body_without_prompt, + "chat": [ + body.messages[i].model_dump(exclude_none=True) + for i in range(len(body.messages)) + ] + + [ + { + "role": "assistant", + "content": task_status["generated_text"], + } + ], + } + elif isinstance(body, CreateCompletionRequest): + chat_log = { + "request": body_without_prompt, + "prompt": { + "user": body.prompt, + "assistant": task_status["generated_text"], + }, + } + else: + chat_log = { + "request": body_without_prompt, + "input": body.input, + "embedding": task_status["embedding_chunks"], + } + chat_logger.info(dumps(chat_log, option=OPT_INDENT_2).decode()) + + @contextmanager def task_manager( body: Union[ @@ -215,10 +290,10 @@ def task_manager( started_at=time(), interrupted=False, embedding_chunks=None, + generated_text="", ) try: logger.info(f"🦙 Handling request of {body.model}...") - logger.debug(f"🦙 Request body: {body}") yield task_status finally: # Cancel the producer task and set event, @@ -246,6 +321,7 @@ def task_manager( logger.info( f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" ) + log_request(body=body, task_status=task_status) async def create_chat_completion_or_completion( @@ -285,7 +361,7 @@ async def create_chat_completion_or_completion( inner_send_chan=send_chan, task=task, interrupt_signal=interrupt_signal, - iterator=get_streaming_iterator( + iterator=get_streaming_iterator( # type: ignore queue=queue, first_response=validate_item_type( await run_in_threadpool(queue.get), type=dict @@ -308,6 +384,9 @@ async def create_chat_completion_or_completion( task_status["completion_tokens"] = completion["usage"][ "completion_tokens" ] + task_status["generated_text"] = get_text_from_completion( + completion + ) return completion diff --git a/llama_api/utils/logger.py b/llama_api/utils/logger.py index 7cfe2bf..dbefbbb 100644 --- a/llama_api/utils/logger.py +++ b/llama_api/utils/logger.py @@ -1,8 +1,8 @@ """Logger module for the API""" import logging -import os from dataclasses import dataclass +from pathlib 
import Path from typing import Dict, Optional from .colorama import Fore, Style @@ -15,6 +15,7 @@ class LoggingConfig: file_log_level: Optional[int] = logging.DEBUG file_log_name: Optional[str] = "./logs/debug.log" logging_format: str = "[%(asctime)s] %(name)s:%(levelname)s - %(message)s" + color: bool = True class ColoredFormatter(logging.Formatter): @@ -52,7 +53,11 @@ def __init__( self, name: str, logging_config: LoggingConfig = LoggingConfig() ) -> None: super().__init__(name=name, level=logging_config.logger_level) - formatter = ColoredFormatter(logging_config.logging_format) + formatter = ( + ColoredFormatter(logging_config.logging_format) + if logging_config.color + else logging.Formatter(logging_config.logging_format) + ) console = logging.StreamHandler() console.setLevel(logging_config.console_log_level) @@ -62,10 +67,9 @@ def __init__( logging_config.file_log_name is not None and logging_config.file_log_level is not None ): - if not os.path.exists( - os.path.dirname(logging_config.file_log_name) - ): - os.makedirs(os.path.dirname(logging_config.file_log_name)) + Path(logging_config.file_log_name).parent.mkdir( + parents=True, exist_ok=True + ) file_handler = logging.FileHandler( filename=logging_config.file_log_name, mode="a", From 778c0bd13075c4739ed3095aab06b7e70c508fc9 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 23:26:42 +0900 Subject: [PATCH 08/15] Fixed bug: llama.cpp context tokens --- llama_api/modules/llama_cpp.py | 1 + llama_api/server/routers/v1.py | 2 ++ llama_api/utils/errors.py | 2 +- main.py | 8 ++++++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 7de518d..d88e3e4 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -130,6 +130,7 @@ def from_pretrained( # Hacky way to pass arguments to older versions of llama-cpp-python if key in signature(llama_cpp.Llama.__init__).parameters.keys() } + kwargs["n_ctx"] = llm_model.max_total_tokens kwargs["model_path"] = llm_model.model_path_resolved kwargs["verbose"] = llm_model.verbose and llm_model.echo client = llama_cpp.Llama(**kwargs) diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 2d6792c..a7b2713 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -410,6 +410,8 @@ async def create_completion(request: Request, body: CreateCompletionRequest): async def create_embedding( body: CreateEmbeddingRequest, ) -> Embedding: + if not environ.get("LLAMA_API_EMBEDDINGS"): + raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" async with get_wix_with_semaphore(body.model) as wix: queue, interrupt_signal = get_queue_and_event() diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index dc7f4e2..3949d51 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -206,7 +206,7 @@ async def custom_route_handler(self, request: Request) -> Response: {"error": error_response}, status_code=401, ) - if authorization != self.authorization: + if authorization.lower() != self.authorization.lower(): api_key = authorization[len("Bearer ") :] # noqa: E203 error_response = ErrorResponse( message=( diff --git a/main.py b/main.py index 4215d49..7f8bd16 100644 --- a/main.py +++ b/main.py @@ -19,11 +19,13 @@ help="Maximum number of process workers to run; default is 1", ) parser.add_argument( + "-i", "--install-pkgs", action="store_true", help="Install all required packages 
before running the server", ) parser.add_argument( + "-c", "--force-cuda", action="store_true", help=( @@ -59,6 +61,11 @@ action="store_true", help="Apply xformers' memory-efficient optimizations", ) + parser.add_argument( + "--disable-embeddings", + action="store_true", + help="Disable embeddings endpoint", + ) args = parser.parse_args() run( @@ -73,5 +80,6 @@ "LLAMA_API_XFORMERS": "1" if args.xformers else "", "LLAMA_API_API_KEY": args.api_key or "", "FORCE_CUDA": "1" if args.force_cuda else "", + "LLAMA_API_EMBEDDINGS": "1" if not args.disable_embeddings else "", }, ) From fcb0d58f8d726bc7e4c6c149b18c1b95d88c4f01 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 14 Aug 2023 23:29:48 +0900 Subject: [PATCH 09/15] Removed assertion: api key should start with "sk-" --- llama_api/utils/errors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 3949d51..e647c79 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -133,8 +133,6 @@ def authorization(self) -> Optional[str]: """API key for authentication""" if self.api_key is None: return None - if not self.api_key.startswith("sk-"): - self.api_key = f"sk-{self.api_key}" return f"Bearer {self.api_key}" def error_message_wrapper( From b85de0e27dab3ad34cd6d4c574780d413a443068 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:14:12 +0900 Subject: [PATCH 10/15] Improved worker load balancing --- llama_api/server/routers/v1.py | 116 ++++++++++++++++----------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index a7b2713..9a5ec15 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -8,6 +8,7 @@ from functools import partial from os import environ from queue import Queue +from random import choice from threading import Event from time import time from typing import ( @@ -66,13 +67,13 @@ chat_logger = ApiLogger( "", logging_config=LoggingConfig( - console_log_level=100, - file_log_name="./logs/chat.log", - color=False, + console_log_level=100, file_log_name="./logs/chat.log", color=False ), ) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) +max_workers = int(environ.get("LLAMA_API_MAX_WORKERS", 1)) +max_semaphores = int(environ.get("LLAMA_API_MAX_SEMAPHORES", 1)) T = TypeVar("T") @@ -90,18 +91,58 @@ class TaskStatus(TypedDict): class WixMetadata: """Worker index (wix) metadata""" - key: Optional[str] = None - semaphore: Semaphore = field(default_factory=lambda: Semaphore(1)) + wix: int + processed_key: Optional[str] = None + semaphore: Semaphore = field( + default_factory=lambda: Semaphore(max_semaphores) + ) # Worker index (wix) is used to keep track of which worker is currently # processing a request. This is used to prevent multiple requests from # creating multiple completion generators at the same time. -wixs: Tuple[WixMetadata] = tuple( - WixMetadata() for _ in range(int(environ.get("LLAMA_API_MAX_WORKERS", 1))) +wix_metas: Tuple[WixMetadata] = tuple( + WixMetadata(wix) for wix in range(max_workers) ) +def get_worker_rank(meta: WixMetadata, request_key: Optional[str]) -> int: + """Get the entry rank for the worker index (wix) metadata. 
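+    A worker already serving the requested model ranks -2, a worker or
+    request without a pinned model ranks -1, and any other worker ranks by
+    the number of semaphore slots it currently has in use.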
+ Lower rank means higher priority of the worker to process the request.""" + global max_semaphores + if request_key == meta.processed_key: + # If the key is the same (worker is processing the same model) + return -2 # return the highest priority + if request_key is None or meta.processed_key is None: + # If not requesting a specific model or the worker is not processing + return -1 # return the second highest priority + return ( + max_semaphores - meta.semaphore.value + ) # return the number of slots in use + + +@asynccontextmanager +async def get_wix_with_semaphore( + request: Request, + request_key: Optional[str] = None, +) -> AsyncGenerator[int, None]: + """Get the worker index (wix) for the key and acquire the semaphore""" + global wix_metas + worker_ranks = [ + get_worker_rank(wix_meta, request_key) for wix_meta in wix_metas + ] + min_rank = min(worker_ranks) + candidates = [i for i, rank in enumerate(worker_ranks) if rank == min_rank] + if not candidates: + raise LookupError("No available wix") + wix_meta = wix_metas[choice(candidates)] + async with wix_meta.semaphore: + if await request.is_disconnected(): + raise get_cancelled_exc_class()() + wix_meta.processed_key = request_key + yield wix_meta.wix + + def validate_item_type(item: Any, type: Type[T]) -> T: """Validate that the item is of the correct type""" if isinstance(item, Exception): @@ -113,53 +154,6 @@ def validate_item_type(item: Any, type: Type[T]) -> T: return item -@asynccontextmanager -async def get_wix_with_semaphore( - key: Optional[str] = None, -) -> AsyncGenerator[int, None]: - """Get the worker index (wix) for the key and acquire the semaphore""" - if key is None: - # Find the first available slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.semaphore.value: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - else: - # Get the worker index (wix) for the key - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.key == key: - async with wix_metadata.semaphore: - yield wix - return - - # If the key is not in the wixs, find the first empty slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.key is None: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - # If there are no empty slot, find available slot - for wix, wix_metadata in enumerate(wixs): - if wix_metadata.semaphore.value: - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - # If there are no available slot, wait for one to become available - for wix, wix_metadata in enumerate(wixs): - async with wix_metadata.semaphore: - wix_metadata.key = key - yield wix - return - - raise LookupError("No available wix") - - def get_text_from_completion( completion: Union[Completion, ChatCompletion] ) -> str: @@ -228,7 +222,7 @@ def get_streaming_iterator( yield validate_item_type(gen, type=dict) -def log_request( +def log_request_and_response( body: Union[ CreateChatCompletionRequest, CreateCompletionRequest, @@ -321,7 +315,7 @@ def task_manager( logger.info( f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" ) - log_request(body=body, task_status=task_status) + log_request_and_response(body=body, task_status=task_status) async def create_chat_completion_or_completion( @@ -332,7 +326,7 @@ async def create_chat_completion_or_completion( If the body is a chat completion, then create a chat completion. If the body is a completion, then create a completion. 
If streaming is enabled, then return an EventSourceResponse.""" - async with get_wix_with_semaphore(body.model) as wix: + async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() producer: Callable[ [ @@ -408,12 +402,12 @@ async def create_completion(request: Request, body: CreateCompletionRequest): @router.post("/embeddings") async def create_embedding( - body: CreateEmbeddingRequest, + request: Request, body: CreateEmbeddingRequest ) -> Embedding: if not environ.get("LLAMA_API_EMBEDDINGS"): raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" - async with get_wix_with_semaphore(body.model) as wix: + async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() producer: Callable[ [CreateEmbeddingRequest, Queue, Event], @@ -441,8 +435,8 @@ async def create_embedding( @router.get("/models") -async def get_models() -> ModelList: - async with get_wix_with_semaphore() as wix: +async def get_models(request: Request) -> ModelList: + async with get_wix_with_semaphore(request) as wix: return ModelList( object="list", data=[ From 6b2e37fe67ac490296de1800a1f5d9592b10677a Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:14:44 +0900 Subject: [PATCH 11/15] bump dependencies --- poetry.lock | 121 ++++++++++++++++++++++++++++++++--------------- pyproject.toml | 5 +- requirements.txt | 15 +++--- 3 files changed, 94 insertions(+), 47 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6e0f091..a969ac9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -362,28 +362,28 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cmake" -version = "3.27.1" +version = "3.27.2" description = "CMake is an open-source, cross-platform family of tools designed to build, test and package software" optional = false python-versions = "*" files = [ - {file = "cmake-3.27.1-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:c62c5a6d42e68eb955fc321f7bc84290e4c4771ee7e5301c2eaa9586c874fd8e"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:18ef1c579cb4c94ece6bbb7c7f3e0170b078bf787f0a372194f0921e79f6098c"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:ad3aca0d94abe6313a7b1c65b8b3d7eb3158786fd1dd6a9f8c42f82850fb974c"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:50bfe69d369a61eb63e5b8af76a2383cf312d1e8449bd797d563f6c62809d317"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:7fb6d9183b90d4cc4db7b022aa7c9ef3431d281aea29ca259de7199bc75b7e09"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:2583464302ecc287619578627e26962386a41a98bbf1fb4c8c90d600ec1a1be5"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:7a5431c7ca0b7145b857dd0eab26f4f9ec42661bb67afa6d437b3e48532b8e3a"}, - {file = "cmake-3.27.1-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1fb6d882bbd7e77fad206dfdbcaf880f4bcd7e8d0c23b37058ee155715bd19ed"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:cee7dd0bcc5bd14d94ecdbbf9883b17f3001adc5f696b7d8eba0482354e5e017"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_i686.whl", 
hash = "sha256:82a6f57449e7bf9b510ed82b29982e4eec8b5c5e80a51208368dc1aa58b8181b"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:7052bb12c3492083169269fee7c7a11c053cae35949346b12d2998b971602b78"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:482e7018fc8d9bc98e7f30b5071c021ca0e27b131dd61900395abfd768c3fe29"}, - {file = "cmake-3.27.1-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:a112dd298b8ac598fef3653dff1592ba4c0f9bf7fe75b77ab44a6edfcceb96d4"}, - {file = "cmake-3.27.1-py2.py3-none-win32.whl", hash = "sha256:b9d68783ea01775d6d4ea220d3b4e90d5e287cf9a1db09c5a9b78c7748e1c3d0"}, - {file = "cmake-3.27.1-py2.py3-none-win_amd64.whl", hash = "sha256:628f75286475b89d6566db62c0869de5f0a07ad9bba10bebe6a48012fa1ee777"}, - {file = "cmake-3.27.1-py2.py3-none-win_arm64.whl", hash = "sha256:ee7a47e37a29b8124d9125a8c390fb94822a2695d80151560004d4f4f78c0ad7"}, - {file = "cmake-3.27.1.tar.gz", hash = "sha256:7ee6af09b2b575a491483b72927ee7e4beb59e7fb86e6d905a7027607a3f367e"}, + {file = "cmake-3.27.2-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:96ac856c4d6b2104408848f0005a8ab2229d4135b171ea9a03e8c33039ede420"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:11fe6129d07982721c5965fd804a4056b8c6e9c4f482ac9e0fe41bb3abc1ab5f"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:f0c64e89e2ea59592980c4fe3821d712fee0e74cf87c2aaec5b3ab9aa809a57c"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ca7650477dff2a1138776b28b79c0e99127be733d3978922e8f87b56a433eed6"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ab2e40fe09e76a7ef67da2bbbf7a4cd1f52db4f1c7b6ccdda2539f918830343a"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:980ee19f12c808cb8ddb56fdcee832501a9f9631799d8b4fc625c0a0b5fb4c55"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:115d30ca0760e3861d9ad6b3288cd11ee72a785b81227da0c1765d3b84e2c009"}, + {file = "cmake-3.27.2-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efc338c939d6d435890a52458a260bf0942bd8392b648d7532a72c1ec0764e18"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:7f7438c60ccc01765b67abfb1797787c3b9459d500a804ed70a4cc181bc02204"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_i686.whl", hash = "sha256:294f008734267e0eee1574ad1b911bed137bc907ab19d60a618dab4615aa1fca"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:197a34dc62ee149ced343545fac67e5a30b93fda65250b065726f86ce92bdada"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:afb46ad883b174fb64347802ba5878423551dbd5847bb64669c39a5957c06eb7"}, + {file = "cmake-3.27.2-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:83611ffd155e270a6b13bbf0cfd4e8688ebda634f448aa2e3734006c745bf33f"}, + {file = "cmake-3.27.2-py2.py3-none-win32.whl", hash = "sha256:53e12deb893da935e236f93accd47dbe2806620cd7654986234dc4487cc49652"}, + {file = "cmake-3.27.2-py2.py3-none-win_amd64.whl", hash = "sha256:611f9722c68c40352d38a6c01960ab038c3d0419e7aee3bf18f95b23031e0dfe"}, + {file = "cmake-3.27.2-py2.py3-none-win_arm64.whl", hash = 
"sha256:30620326b51ac2ce0d8f476747af6367a7ea21075c4d065fad9443904b07476a"}, + {file = "cmake-3.27.2.tar.gz", hash = "sha256:7cd6e2d7d5a1125f8c26c4f65214f8c942e3f276f98c16cb62ae382c35609f25"}, ] [package.extras] @@ -565,13 +565,13 @@ pgp = ["gpg"] [[package]] name = "exceptiongroup" -version = "1.1.2" +version = "1.1.3" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, - {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, ] [package.extras] @@ -579,17 +579,17 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.100.1" +version = "0.101.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.7" files = [ - {file = "fastapi-0.100.1-py3-none-any.whl", hash = "sha256:ec6dd52bfc4eff3063cfcd0713b43c87640fefb2687bbbe3d8a08d94049cdf32"}, - {file = "fastapi-0.100.1.tar.gz", hash = "sha256:522700d7a469e4a973d92321ab93312448fbe20fca9c8da97effc7e7bc56df23"}, + {file = "fastapi-0.101.1-py3-none-any.whl", hash = "sha256:aef5f8676eb1b8389952e1fe734abe20f04b71f6936afcc53b320ba79b686a4b"}, + {file = "fastapi-0.101.1.tar.gz", hash = "sha256:7b32000d14ca9992f7461117b81e4ef9ff0c07936af641b4fe40e67d5f9d63cb"}, ] [package.dependencies] -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<3.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" starlette = ">=0.27.0,<0.28.0" typing-extensions = ">=4.5.0" @@ -1855,13 +1855,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydantic-settings" -version = "2.0.2" +version = "2.0.3" description = "Settings management using Pydantic" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic_settings-2.0.2-py3-none-any.whl", hash = "sha256:6183a2abeab465d5a3ab69758e9a22d38b0cc2ba193f0b85f6971a252ea630f6"}, - {file = "pydantic_settings-2.0.2.tar.gz", hash = "sha256:342337fff50b23585e807a86dec85037900972364435c55c2fc00d16ff080539"}, + {file = "pydantic_settings-2.0.3-py3-none-any.whl", hash = "sha256:ddd907b066622bd67603b75e2ff791875540dc485b7307c4fffc015719da8625"}, + {file = "pydantic_settings-2.0.3.tar.gz", hash = "sha256:962dc3672495aad6ae96a4390fac7e593591e144625e5112d359f8f67fb75945"}, ] [package.dependencies] @@ -2640,13 +2640,13 @@ files = [ [[package]] name = "sse-starlette" -version = "1.6.1" +version = "1.6.5" description = "\"SSE plugin for Starlette\"" optional = false python-versions = ">=3.8" files = [ - {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, - {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, + {file = "sse-starlette-1.6.5.tar.gz", hash = "sha256:819f2c421fb37067380fe3dcaba246c476b02651b7bb7601099a378ad802a0ac"}, + {file = "sse_starlette-1.6.5-py3-none-any.whl", hash = "sha256:68b6b7eb49be0c72a2af80a055994c13afcaa4761b29226beb208f954c25a642"}, ] 
[package.dependencies] @@ -2684,6 +2684,51 @@ files = [ numpy = ">=1.12.0" protobuf = ">=3.19.6" +[[package]] +name = "tiktoken" +version = "0.4.0" +description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tiktoken-0.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:176cad7f053d2cc82ce7e2a7c883ccc6971840a4b5276740d0b732a2b2011f8a"}, + {file = "tiktoken-0.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:450d504892b3ac80207700266ee87c932df8efea54e05cefe8613edc963c1285"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d662de1e7986d129139faf15e6a6ee7665ee103440769b8dedf3e7ba6ac37f"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5727d852ead18b7927b8adf558a6f913a15c7766725b23dbe21d22e243041b28"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c06cd92b09eb0404cedce3702fa866bf0d00e399439dad3f10288ddc31045422"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9ec161e40ed44e4210d3b31e2ff426b4a55e8254f1023e5d2595cb60044f8ea6"}, + {file = "tiktoken-0.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:1e8fa13cf9889d2c928b9e258e9dbbbf88ab02016e4236aae76e3b4f82dd8288"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb2341836b725c60d0ab3c84970b9b5f68d4b733a7bcb80fb25967e5addb9920"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ca30367ad750ee7d42fe80079d3092bd35bb266be7882b79c3bd159b39a17b0"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3dc3df19ddec79435bb2a94ee46f4b9560d0299c23520803d851008445671197"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d980fa066e962ef0f4dad0222e63a484c0c993c7a47c7dafda844ca5aded1f3"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:329f548a821a2f339adc9fbcfd9fc12602e4b3f8598df5593cfc09839e9ae5e4"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b1a038cee487931a5caaef0a2e8520e645508cde21717eacc9af3fbda097d8bb"}, + {file = "tiktoken-0.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:08efa59468dbe23ed038c28893e2a7158d8c211c3dd07f2bbc9a30e012512f1d"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3020350685e009053829c1168703c346fb32c70c57d828ca3742558e94827a9"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba16698c42aad8190e746cd82f6a06769ac7edd415d62ba027ea1d99d958ed93"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c15d9955cc18d0d7ffcc9c03dc51167aedae98542238b54a2e659bd25fe77ed"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64e1091c7103100d5e2c6ea706f0ec9cd6dc313e6fe7775ef777f40d8c20811e"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e87751b54eb7bca580126353a9cf17a8a8eaadd44edaac0e01123e1513a33281"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e063b988b8ba8b66d6cc2026d937557437e79258095f52eaecfafb18a0a10c03"}, + {file = "tiktoken-0.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:9c6dd439e878172dc163fced3bc7b19b9ab549c271b257599f55afc3a6a5edef"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:8d1d97f83697ff44466c6bef5d35b6bcdb51e0125829a9c0ed1e6e39fb9a08fb"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b6bce7c68aa765f666474c7c11a7aebda3816b58ecafb209afa59c799b0dd2d"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a73286c35899ca51d8d764bc0b4d60838627ce193acb60cc88aea60bddec4fd"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0394967d2236a60fd0aacef26646b53636423cc9c70c32f7c5124ebe86f3093"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:dae2af6f03ecba5f679449fa66ed96585b2fa6accb7fd57d9649e9e398a94f44"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55e251b1da3c293432179cf7c452cfa35562da286786be5a8b1ee3405c2b0dd2"}, + {file = "tiktoken-0.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:c835d0ee1f84a5aa04921717754eadbc0f0a56cf613f78dfc1cf9ad35f6c3fea"}, + {file = "tiktoken-0.4.0.tar.gz", hash = "sha256:59b20a819969735b48161ced9b92f05dc4519c17be4015cfb73b65270a243620"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + [[package]] name = "tokenizers" version = "0.13.3" @@ -2762,13 +2807,13 @@ files = [ [[package]] name = "tqdm" -version = "4.66.0" +version = "4.66.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.0-py3-none-any.whl", hash = "sha256:39d459c7140b7890174e69d4d68d6291bc774a55b4bc5d93c0b760798ac5a03e"}, - {file = "tqdm-4.66.0.tar.gz", hash = "sha256:cc6e7e52202d894e66632c5c8a9330bd0e3ff35d2965c93ca832114a3d865362"}, + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, ] [package.dependencies] @@ -2981,13 +3026,13 @@ test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "my [[package]] name = "virtualenv" -version = "20.24.2" +version = "20.24.3" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.24.2-py3-none-any.whl", hash = "sha256:43a3052be36080548bdee0b42919c88072037d50d56c28bd3f853cbe92b953ff"}, - {file = "virtualenv-20.24.2.tar.gz", hash = "sha256:fd8a78f46f6b99a67b7ec5cf73f92357891a7b3a40fd97637c27f854aae3b9e0"}, + {file = "virtualenv-20.24.3-py3-none-any.whl", hash = "sha256:95a6e9398b4967fbcb5fef2acec5efaf9aa4972049d9ae41f95e0972a683fd02"}, + {file = "virtualenv-20.24.3.tar.gz", hash = "sha256:e5c3b4ce817b0b328af041506a2a299418c98747c4b1e68cb7527e74ced23efc"}, ] [package.dependencies] @@ -3264,4 +3309,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12" -content-hash = "09b071c03e16e84be7d44b7cfc1670a2035ac8f96c5894cfd11726355a9fe3b4" +content-hash = "7bd21a07f403c13e49b67e13595f46dbac47bfdb4d1d0fc5fd2f40f08e62f886" diff --git a/pyproject.toml b/pyproject.toml index 2d80eb7..ef68833 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,14 +15,14 @@ python = ">=3.8.1,<3.12" poetry = "^1.5.1" uvicorn = { extras = ["standard"], version = "^0.23" } -fastapi = "^0.100.1" +fastapi = ">=0.100.1" orjson = "^3.9" sse-starlette = "^1.6" psutil = "^5.9" cmake = ">=3.18.0" filelock = "^3.12" transformers = "^4.31.0" -tensorflow-hub = 
"^0.14" +tensorflow-hub = ">=0.14" numpy = "^1.24.3" safetensors = "^0.3.1" ninja = "^1.11.1" @@ -31,6 +31,7 @@ pydantic = "^2.0.0" pydantic-settings = "^2.0.0" sentencepiece = ">=0.1.97" typing-extensions = ">=4.0.0" +tiktoken = ">=0.4.0" # torch: 2.0.1+cu118 for GPU, 2.0.1+cpu for CPU [tool.poetry.group.dev.dependencies] diff --git a/requirements.txt b/requirements.txt index ac45196..7801854 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,15 +8,15 @@ cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (s charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" -cmake==3.27.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +cmake==3.27.2 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" cryptography==41.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" diskcache==5.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" distlib==0.3.7 ; python_full_version >= "3.8.1" and python_version < "3.12" dulwich==0.21.5 ; python_full_version >= "3.8.1" and python_version < "3.12" -exceptiongroup==1.1.2 ; python_full_version >= "3.8.1" and python_version < "3.11" -fastapi==0.100.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +exceptiongroup==1.1.3 ; python_full_version >= "3.8.1" and python_version < "3.11" +fastapi==0.101.1 ; python_full_version >= "3.8.1" and python_version < "3.12" filelock==3.12.2 ; python_full_version >= "3.8.1" and python_version < "3.12" fsspec==2023.6.0 ; python_full_version >= "3.8.1" and python_version < "3.12" h11==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -51,7 +51,7 @@ psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") pydantic-core==2.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -pydantic-settings==2.0.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic-settings==2.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" pydantic==2.1.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pyproject-hooks==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -69,20 +69,21 @@ sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.1 shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" six==1.16.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sniffio==1.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" -sse-starlette==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +sse-starlette==1.6.5 ; python_full_version >= "3.8.1" and python_version < "3.12" starlette==0.27.0 ; python_full_version >= "3.8.1" and python_version < "3.12" tensorflow-hub==0.14.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tiktoken==0.4.0 ; python_full_version >= 
"3.8.1" and python_version < "3.12" tokenizers==0.13.3 ; python_full_version >= "3.8.1" and python_version < "3.12" tomli==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.11" tomlkit==0.12.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -tqdm==4.66.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +tqdm==4.66.1 ; python_full_version >= "3.8.1" and python_version < "3.12" transformers==4.31.0 ; python_full_version >= "3.8.1" and python_version < "3.12" trove-classifiers==2023.8.7 ; python_full_version >= "3.8.1" and python_version < "3.12" typing-extensions==4.7.1 ; python_full_version >= "3.8.1" and python_version < "3.12" urllib3==1.26.16 ; python_full_version >= "3.8.1" and python_version < "3.12" uvicorn[standard]==0.23.2 ; python_full_version >= "3.8.1" and python_version < "3.12" uvloop==0.17.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_full_version >= "3.8.1" and python_version < "3.12" -virtualenv==20.24.2 ; python_full_version >= "3.8.1" and python_version < "3.12" +virtualenv==20.24.3 ; python_full_version >= "3.8.1" and python_version < "3.12" watchfiles==0.19.0 ; python_full_version >= "3.8.1" and python_version < "3.12" webencodings==0.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" websockets==11.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" From 086658326e68f4dcec425f659535449c01639e3b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 15 Aug 2023 21:15:38 +0900 Subject: [PATCH 12/15] Implemented OpenAI compatible logit bias --- llama_api/logits/bias.py | 64 ++++++++++++++++++++++++--------- llama_api/mixins/logits.py | 5 +-- llama_api/modules/exllama.py | 14 +++++--- llama_api/schemas/api.py | 12 +++---- llama_api/server/pools/llama.py | 1 + 5 files changed, 64 insertions(+), 32 deletions(-) diff --git a/llama_api/logits/bias.py b/llama_api/logits/bias.py index ebdae0f..5c716e4 100644 --- a/llama_api/logits/bias.py +++ b/llama_api/logits/bias.py @@ -1,10 +1,30 @@ -from typing import TYPE_CHECKING, Callable, Dict, List, Literal, Optional +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, +) +from ..utils.logger import ApiLogger from .base import BaseLogitProcessor if TYPE_CHECKING: import torch as pytorch +logger = ApiLogger(__name__) + +try: + import tiktoken + + openai_decoder = tiktoken.get_encoding("cl100k_base").decode +except Exception as e: + logger.warning( + "Could not load tiktoken, which is required for OpenAI GPT models. 
" + f"Please `pip install tiktoken` to use the OpenAI encoder: {e}" + ) + openai_decoder: Optional[Callable[[List[int]], str]] = None + class LogitBiasProcessor(BaseLogitProcessor): """Create a logit bias processor to bias the logit scores.""" @@ -12,23 +32,33 @@ class LogitBiasProcessor(BaseLogitProcessor): def __init__( self, logit_bias: Dict[str, float], - logit_bias_type: Optional[Literal["input_ids", "tokens"]], encoder: Callable[[str], List[int]], + is_openai: bool = False, ): - if logit_bias_type is None: - logit_bias_type = "input_ids" + """Create a logit bias processor to bias the logit scores.""" + + global openai_decoder - to_bias = {} # type: Dict[int, float] - if logit_bias_type == "input_ids": - for input_id_string, score in logit_bias.items(): - to_bias[int(input_id_string)] = score + biases = {} # type: Dict[int, float] + for id_or_token, bias in logit_bias.items(): + is_digit = id_or_token.isdigit() - elif logit_bias_type == "tokens": - for token, score in logit_bias.items(): - for input_id in encoder(token): - to_bias[input_id] = score + if is_digit and is_openai and openai_decoder is not None: + # If we have an OpenAI id, we need to convert it to a token + # and then encode the token to get the ids + for id in encoder(openai_decoder([int(id_or_token)])): + if abs(bias) > abs(biases.get(id, 0.0)): + biases[id] = bias + elif is_digit: + # If we have a digit, we can just use it directly + biases[int(id_or_token)] = bias + else: + # Otherwise, we need to encode the token and use the ids + for id in encoder(id_or_token): + if abs(bias) > abs(biases.get(id, 0.0)): + biases[id] = bias - self._to_bias = to_bias + self._biases = biases self._bias_tensor = None def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": @@ -38,8 +68,8 @@ def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor": self._bias_tensor = torch.zeros( scores.shape[-1], dtype=scores.dtype, device=scores.device ) - for idx, value in self._to_bias.items(): - self._bias_tensor[idx] = value + for id, bias in self._biases.items(): + self._bias_tensor[id] = bias return self._bias_tensor @@ -51,6 +81,6 @@ def with_torch( def without_torch( self, input_ids: List[int], scores: List[float] ) -> List[float]: - for id, biased_score in self._to_bias.items(): - scores[id] += biased_score + for id, bias in self._biases.items(): + scores[id] += bias return scores diff --git a/llama_api/mixins/logits.py b/llama_api/mixins/logits.py index 75867a1..b90bfce 100644 --- a/llama_api/mixins/logits.py +++ b/llama_api/mixins/logits.py @@ -9,7 +9,8 @@ class LogitsMixin: @staticmethod def get_logit_processors( - settings: TextGenerationSettings, encoder: Callable[[str], List[int]] + settings: TextGenerationSettings, + encoder: Callable[[str], List[int]], ) -> List[BaseLogitProcessor]: logit_processors: List[BaseLogitProcessor] = [] if settings.muse: @@ -27,8 +28,8 @@ def get_logit_processors( 0, LogitBiasProcessor( logit_bias=settings.logit_bias, - logit_bias_type=settings.logit_bias_type, encoder=encoder, + is_openai=settings.is_openai, ), ) return logit_processors diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 7d9e960..df8564f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -322,8 +322,13 @@ def _generate_text_with_streaming( generator = _apply_settings_to_generator(cg, settings=settings) # Start the generator + context_window = cg.llm_model.max_total_tokens if settings.guidance_scale == 1: ids = _encode(cg.tokenizer, prompt) + 
prompt_tokens = ids.shape[-1] + cg.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) mask = None # type: Optional[Tensor] generator.end_beam_search() generator.gen_begin_reuse(ids) @@ -333,13 +338,12 @@ def _generate_text_with_streaming( [prompt, settings.negative_prompt or ""], return_mask=True, ) + prompt_tokens = ids.shape[-1] + cg.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) generator.gen_begin(ids, mask=mask) - prompt_tokens = ids.shape[-1] - context_window = cg.llm_model.max_total_tokens - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) settings.max_tokens = min( settings.max_tokens, context_window - prompt_tokens ) diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index 543c76c..e052324 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -189,14 +189,6 @@ class TextGenerationSettings(BaseModel): "logits of the model to influence." ), ) - logit_bias_type: Literal["input_ids", "tokens"] = Field( - default="tokens", - description=( - "The type of logit bias to use. If 'input_ids', the bias is applied to the input" - " ids(integer). If 'tokens', the bias is applied to the tokens(string). If None, the bias is not " - "applied." - ), - ) ban_eos_token: bool = Field( default=False, description="If True, the EOS token is banned from being generated.", @@ -219,6 +211,10 @@ class TextGenerationSettings(BaseModel): "The negative prompt is used to encourage the model not to generate samples that are too similar to the " "negative prompt. CFG is enabled by setting `guidance_scale > 1`.", ) + is_openai: bool = Field( + default=False, + description="If True, the model is regarded as an OpenAI model.", + ) class CreateEmbeddingRequest(BaseModel): diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index eca2af2..5d1751e 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -94,6 +94,7 @@ def get_completion_generator( ) if body.model in openai_replacement_models: body.model = openai_replacement_models[body.model] + body.is_openai = True # Check if the model is defined in LLMModels enum llm_model = get_model(body.model) From fbb5d0a04bcf6f7ef37f342db57dd05178183bb2 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Wed, 16 Aug 2023 01:21:51 +0900 Subject: [PATCH 13/15] Better error logger --- build_shared_lib.py | 15 ++++++ llama_api/server/pools/llama.py | 25 ++++----- llama_api/server/routers/v1.py | 8 +-- llama_api/utils/huggingface_downloader.py | 4 +- llama_api/utils/llama_cpp.py | 14 +---- llama_api/utils/logger.py | 64 ++++++++++++++++++++--- llama_api/utils/path.py | 9 +--- 7 files changed, 90 insertions(+), 49 deletions(-) create mode 100644 build_shared_lib.py diff --git a/build_shared_lib.py b/build_shared_lib.py new file mode 100644 index 0000000..819f819 --- /dev/null +++ b/build_shared_lib.py @@ -0,0 +1,15 @@ +# flake8: noqa + +from llama_api.utils.llama_cpp import ( + build_shared_lib, + CPU_ARGS, # Only use CPU + METAL_ARGS, # Only use Metal (MacOS) + CUBLAS_ARGS, # Only use CUBLAS (Nvidia) +) +from os import environ + + +if __name__ == "__main__": + environ["FORCE_CMAKE"] = "1" + environ["CMAKE_ARGS"] = CPU_ARGS # EDIT THIS LINE TO CHANGE BUILD TYPE !!! 
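+    # The imports above also expose METAL_ARGS (macOS) and CUBLAS_ARGS
+    # (NVIDIA); either may be assigned to CMAKE_ARGS here in place of
+    # CPU_ARGS to target a GPU build.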
+ build_shared_lib() diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 5d1751e..c3f5756 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -70,10 +70,10 @@ def get_model_names() -> List[str]: def get_model(model_name: str) -> "BaseLLMModel": """Get a model from the model_definitions.py file""" - try: + with logger.log_any_error( + f"Error getting model: {model_name}", exc_info=None + ): return getattr(model_definitions, model_name) - except Exception: - raise AssertionError(f"Could not find a model: {model_name}") def get_completion_generator( @@ -87,7 +87,9 @@ def get_completion_generator( If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" - try: + with logger.log_any_error( + f"Error getting a completion generator of {body.model}" + ): # Check if the model is an OpenAI model openai_replacement_models: Dict[str, str] = getattr( model_definitions, "openai_replacement_models", {} @@ -140,11 +142,6 @@ def get_completion_generator( # Add the new completion generator to the deque cache completion_generators.append(to_return) return to_return - except (AssertionError, OSError, MemoryError) as e: - raise e - except Exception as e: - logger.exception(f"Exception in get_completion_generator: {e}") - raise AssertionError(f"Could not find a model: {body.model}") def get_embedding_generator( @@ -153,7 +150,10 @@ def get_embedding_generator( """Get an embedding generator for the given model. If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" - try: + + with logger.log_any_error( + f"Error getting a embedding generator of {body.model}" + ): body.model = body.model.lower() for embedding_generator in embedding_generators: if embedding_generator.model_name == body.model: @@ -190,11 +190,6 @@ def get_embedding_generator( # Add the new completion generator to the deque cache embedding_generators.append(to_return) return to_return - except (AssertionError, OSError, MemoryError) as e: - raise e - except Exception as e: - logger.exception(f"Exception in get_embedding_generator: {e}") - raise AssertionError(f"Could not find a model: {body.model}") def generate_completion_chunks( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 9a5ec15..ba0c49d 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -2,7 +2,7 @@ Use same format as OpenAI API""" -from asyncio import Task, create_task +from asyncio import CancelledError, Task, create_task from contextlib import asynccontextmanager, contextmanager from dataclasses import dataclass, field from functools import partial @@ -138,7 +138,7 @@ async def get_wix_with_semaphore( wix_meta = wix_metas[choice(candidates)] async with wix_meta.semaphore: if await request.is_disconnected(): - raise get_cancelled_exc_class()() + raise CancelledError("Request is disconnected") wix_meta.processed_key = request_key yield wix_meta.wix @@ -200,10 +200,10 @@ async def get_event_publisher( if await request.is_disconnected(): raise get_cancelled_exc_class()() await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class() as e: + except get_cancelled_exc_class(): with move_on_after(1, shield=True): task_status["interrupted"] = True - raise e + raise def get_streaming_iterator( diff --git a/llama_api/utils/huggingface_downloader.py b/llama_api/utils/huggingface_downloader.py index eedbf01..57fea87 100644 --- 
a/llama_api/utils/huggingface_downloader.py +++ b/llama_api/utils/huggingface_downloader.py @@ -84,10 +84,10 @@ def __init__( ) except ValueError as err_branch: logger.error(err_branch) - raise err_branch + raise except HTTPError as err_http: logger.error(err_http) - raise err_http + raise @property def model(self) -> str: diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index fc16ef2..b1de480 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -1,9 +1,8 @@ import shutil import subprocess import sys -from contextlib import contextmanager from logging import Logger, getLogger -from os import chdir, environ, getcwd +from os import environ from pathlib import Path from typing import List, Optional, Union @@ -54,17 +53,6 @@ } -@contextmanager -def _temporary_change_cwd(path): - # Change the current working directory to `path` and then change it back - prev_cwd = getcwd() - chdir(path) - try: - yield - finally: - chdir(prev_cwd) - - def _git_clone_if_not_exists() -> None: # Clone the git repos if they don't exist for clone_path, clone_command in GIT_CLONES.items(): diff --git a/llama_api/utils/logger.py b/llama_api/utils/logger.py index dbefbbb..093f005 100644 --- a/llama_api/utils/logger.py +++ b/llama_api/utils/logger.py @@ -1,9 +1,10 @@ """Logger module for the API""" - +# flake8: noqa +from contextlib import contextmanager import logging from dataclasses import dataclass from pathlib import Path -from typing import Dict, Optional +from typing import Callable, Dict, Generator, Optional, Union from .colorama import Fore, Style @@ -82,7 +83,7 @@ def __init__( self.addHandler(console) @classmethod - def cinfo(cls, msg: str, *args, **kwargs) -> None: + def cinfo(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super( @@ -91,7 +92,7 @@ def cinfo(cls, msg: str, *args, **kwargs) -> None: ).info(msg, *args, **kwargs) @classmethod - def cdebug(cls, msg: str, *args, **kwargs) -> None: + def cdebug(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).debug( @@ -99,7 +100,7 @@ def cdebug(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cwarning(cls, msg: str, *args, **kwargs) -> None: + def cwarning(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).warning( @@ -107,7 +108,7 @@ def cwarning(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cerror(cls, msg: str, *args, **kwargs) -> None: + def cerror(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).error( @@ -115,7 +116,7 @@ def cerror(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def cexception(cls, msg: str, *args, **kwargs) -> None: + def cexception(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).exception( @@ -123,9 +124,56 @@ def cexception(cls, msg: str, *args, **kwargs) -> None: ) @classmethod - def ccritical(cls, msg: str, *args, **kwargs) -> None: + def ccritical(cls, msg: object, *args, **kwargs) -> None: if cls.__name__ not in cls._instances: cls(cls.__name__) super(ApiLogger, cls._instances[cls.__name__]).critical( msg, *args, **kwargs ) + + @contextmanager + def log_any_error( + self, + msg: Optional[object] = None, + 
level: int = logging.ERROR, + exc_info: Optional[Union[bool, Exception]] = True, + suppress_exception: bool = False, + on_error: Optional[Callable[[Exception], None]] = None, + *args, + **kwargs, + ) -> Generator[None, None, None]: + """ + A context manager to automatically log exceptions that occur within its context. + + Args: + msg (Optional[object], default=None): An optional message to be prepended to the exception message in the log. + level (int, default=logging.ERROR): The logging level at which the exception should be logged. Default is ERROR. + exc_info (logging._ExcInfoType, default=True): If set to True, exception information will be added to the log. Otherwise, only the exception message will be logged. + suppress_exception (bool, default=False): If True, the exception will be suppressed (not re-raised). If False, the exception will be re-raised after logging. + on_error (Optional[Callable[[Exception], None]], default=None): A callback function that will be invoked with the exception as its argument if one occurs. + *args: Variable length argument list passed to the logging function. + **kwargs: Arbitrary keyword arguments passed to the logging function. + + Usage: + with logger.log_any_error(msg="An error occurred", level=logging.WARNING, on_error=my_callback_function): + potentially_faulty_function() + + Notes: + - If a custom message is provided using the 'msg' parameter, it will be prepended to the actual exception message in the log. + - If 'on_error' is provided, it will be executed with the caught exception as its argument. This can be used for custom handling or notification mechanisms. + """ + + try: + yield + except Exception as e: + self.log( + level, + f"{msg}: {e}" if msg else e, + *args, + **kwargs, + exc_info=exc_info, + ) + if on_error: + on_error(e) + if not suppress_exception: + raise diff --git a/llama_api/utils/path.py b/llama_api/utils/path.py index 84f6f9d..27a51cd 100644 --- a/llama_api/utils/path.py +++ b/llama_api/utils/path.py @@ -154,7 +154,7 @@ def resolve_model_path_to_posix( model_path: str, default_relative_directory: Optional[str] = None ) -> str: """Resolve a model path to a POSIX path.""" - try: + with logger.log_any_error("Error resolving model path"): path = Path(model_path) if path.is_absolute(): # The path is already absolute @@ -191,9 +191,6 @@ def resolve_model_path_to_posix( ) # Try to resolve the model path from Huggingface return HuggingfaceResolver(model_path).resolve() - except Exception as e: - logger.error(f"Error resolving model path: {e}") - raise e def resolve_model_path_to_posix_with_cache( @@ -230,11 +227,9 @@ def resolve_model_path_to_posix_with_cache( cache[model_path] = resolved # Update the cache file - try: + with logger.log_any_error("Error writing model path cache"): with open(cache_file, "w") as f: f.write(orjson.dumps(cache).decode()) - except Exception as e: - logger.error(f"Error writing model path cache: {e}") return resolved except (Timeout, TypeError) as e: logger.warning( From 7da7b60ac33fdf153f1df5153da4f371bd0b4df6 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Wed, 16 Aug 2023 01:22:02 +0900 Subject: [PATCH 14/15] lora support for exllama --- llama_api/modules/exllama.py | 42 ++++++-- llama_api/modules/exllama_lora.py | 169 ++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 11 deletions(-) create mode 100644 llama_api/modules/exllama_lora.py diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index df8564f..bd868c3 100644 --- a/llama_api/modules/exllama.py +++ 
b/llama_api/modules/exllama.py @@ -7,14 +7,13 @@ logger = ApiLogger(__name__) if environ.get("LLAMA_API_XFORMERS") == "1": - try: + with logger.log_any_error( + "xformers mode is enabled, but xformers is not installed", + suppress_exception=True, + ): from ..modules.xformers import hijack_attention_forward hijack_attention_forward() - except Exception as e: - logger.warning( - f"xformers mode is enabled, but xformers is not installed: {e}" - ) from pathlib import Path from typing import ( TYPE_CHECKING, @@ -43,6 +42,7 @@ from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator +from .exllama_lora import ExLlamaLora with import_repository( git_path="https://github.com/turboderp/exllama", @@ -309,7 +309,7 @@ def _generate_text_with_streaming( prompt: str, settings: "TextGenerationSettings", ) -> Iterator[str]: - try: + with logger.log_any_error(): # Make sure that the stop token is a list if isinstance(settings.stop, str): stops = [settings.stop] # type: List[str] @@ -321,6 +321,10 @@ def _generate_text_with_streaming( # Apply the settings to the generator generator = _apply_settings_to_generator(cg, settings=settings) + # Apply the LORA model + if cg.lora: + generator.lora = cg.lora # type: ignore + # Start the generator context_window = cg.llm_model.max_total_tokens if settings.guidance_scale == 1: @@ -351,9 +355,6 @@ def _generate_text_with_streaming( yield from _generator( cg, settings=settings, cfg_mask=mask, stops=stops ) - except Exception as e: - logger.exception(e) - raise e class ExllamaCompletionGenerator(BaseCompletionGenerator): @@ -363,6 +364,7 @@ class ExllamaCompletionGenerator(BaseCompletionGenerator): _tokenizer: Optional[ExLlamaTokenizer] = None _generator: Optional[ExLlamaGenerator] = None _llm_model: Optional["ExllamaModel"] = None + _lora: Optional["ExLlamaLora"] = None _completion_status: Dict[ str, int ] = {} # key: completion_id, value: number of completion tokens @@ -397,22 +399,40 @@ def config(self) -> ExLlamaConfig: assert self._config is not None, "Config is not initialized." 
return self._config + @property + def lora(self) -> Optional[ExLlamaLora]: + return self._lora + @classmethod def from_pretrained( cls, llm_model: "ExllamaModel" ) -> "ExllamaCompletionGenerator": - result = cls() model_folder_path = Path(llm_model.model_path_resolved) + lora_path = model_folder_path / "adapter_model.bin" + lora_config_path = model_folder_path / "adapter_config.json" + + result = cls() + result._llm_model = llm_model result._config = _make_config(model_folder_path, llm_model) result._tokenizer = ExLlamaTokenizer( (model_folder_path / "tokenizer.model").as_posix() ) result._model = ExLlama(result._config) + if lora_path.exists() and lora_config_path.exists(): + logger.info(f"🦙 LORA model found for {result.model_name}") + with logger.log_any_error( + f"🦙 LORA model loading failed for {result.model_name}" + ): + result._lora = ExLlamaLora( + model=result._model, + lora_config_path=lora_config_path.as_posix(), + lora_path=lora_path.as_posix(), + ) + logger.info(f"🦙 LORA model loaded for {result.model_name}") result._cache = ExLlamaCache(result._model) result._generator = ExLlamaGenerator( result._model, result._tokenizer, result._cache ) - result._llm_model = llm_model return result def generate_completion_with_streaming( diff --git a/llama_api/modules/exllama_lora.py b/llama_api/modules/exllama_lora.py new file mode 100644 index 0000000..7f2c3c9 --- /dev/null +++ b/llama_api/modules/exllama_lora.py @@ -0,0 +1,169 @@ +# flake8: noqa +from pathlib import Path +from typing import Dict, Union +from llama_api.utils.dependency import import_repository + +with import_repository( + git_path="https://github.com/turboderp/exllama", + disk_path="repositories/exllama", +): + from repositories.exllama.model import ExLlama, Ex4bitLinear, ExLlamaConfig + +import json + +import torch +from safetensors.torch import load_file as safe_load_file +from torch import load as load_file + + +class ExLlamaLora: + lora_config_path: str + lora_path: str + lora_r: int + lora_alpha: float + lora_scaling: float + config: ExLlamaConfig + tensors: Dict[str, torch.Tensor] + bias_ignored: bool + + def __init__( + self, + model: ExLlama, + lora_config_path: Union[str, Path], + lora_path: Union[str, Path], + ): + self.lora_config_path = str(lora_config_path) + self.lora_path = str(lora_path) + self.model = model + self.config = model.config + self.tensors = {} + self.bias_ignored = False + + # Grab relevant items from LoRA config + with open(lora_config_path) as f: + read_config = json.load(f) + + self.lora_r = read_config["r"] + self.lora_alpha = float(read_config["lora_alpha"]) + self.lora_scaling = self.lora_alpha / self.lora_r + + if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]: + raise ValueError(" ## Error: fan_in_fan_out mode not supported.") + + # Load LoRA weights + if self.lora_path.endswith(".safetensors"): + f = safe_load_file(self.lora_path, device="cpu") + else: + f = load_file(self.lora_path, map_location="cpu") + + for key in f.keys(): + tensor = f[key] + + # Find target module + i = key.find("model.layers.") + if i == -1: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + target_key = key[i:] + ks = target_key.split(".") + decoder_idx = int(ks[2]) + decoder_part = ks[3] + decoder_layer = ks[4] + lora_half = ks[5] + + if lora_half == "bias": + epsilon = 1e-6 + if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: + raise ValueError( + f" ## Error: unsupported bias target {self.lora_path}: {key}" + ) + self.bias_ignored = True + 
continue + + target_module = self.model.layers[decoder_idx] + if decoder_part == "self_attn": + target_module = target_module.self_attn + elif decoder_part == "mlp": + target_module = target_module.mlp + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + if decoder_layer == "q_proj": + target_module = target_module.q_proj + elif decoder_layer == "k_proj": + target_module = target_module.k_proj + elif decoder_layer == "v_proj": + target_module = target_module.v_proj + elif decoder_layer == "o_proj": + target_module = target_module.o_proj + elif decoder_layer == "gate_proj": + target_module = target_module.gate_proj + elif decoder_layer == "up_proj": + target_module = target_module.up_proj + elif decoder_layer == "down_proj": + target_module = target_module.down_proj + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + # Check that shape is compatible + assert isinstance( + target_module, Ex4bitLinear + ), f"Target module {target_module} is not Ex4bitLinear, but {type(target_module)}" + + if lora_half == "lora_A": + in_features = tensor.shape[1] + out_features = None + elif lora_half == "lora_B": + in_features = None + out_features = tensor.shape[0] + else: + raise ValueError( + f" ## Error: unsupported layer in {self.lora_path}: {key}" + ) + + if (in_features and in_features != target_module.in_features) or ( + out_features and out_features != target_module.out_features + ): + raise ValueError( + f" ## Error: incompatible tensor shape in {self.lora_path}: {key}" + ) + + # For efficiency, transpose adapter instead of transposing state during inference + + tensor = tensor.T.contiguous() + + # Pre-scale + + if lora_half == "lora_B" and self.lora_scaling != 1.0: + tensor.mul_(self.lora_scaling) + + # Check that dtype is compatible, or convert + + if tensor.dtype == torch.bfloat16: + tensor = tensor.to(torch.float16) + + elif tensor.dtype == torch.float32: + tensor = tensor.to(torch.float16) + + elif tensor.dtype == torch.float16: + pass + + else: + raise ValueError( + f" ## Error: unsupported tensor dtype in {self.lora_path}" + ) + + # Move to target device + + device = self.config.device_map.map(target_key) + tensor = tensor.to(device, non_blocking=True) + + # Store adapter tensor + + self.tensors[target_key] = tensor From 1f111ba1d938d89b97045904e1c373be90e1cf28 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 00:04:50 +0900 Subject: [PATCH 15/15] update: docker image & readme --- .gitignore | 3 +- build_shared_lib.py | 19 ++++++++- docker-compose.persistent.yml | 2 +- docker-compose.yml | 2 +- llama_api/server/routers/v1.py | 4 +- main.py | 4 +- readme.md | 73 +++++++++++++++++++--------------- requirements-all.txt | 26 ------------ 8 files changed, 67 insertions(+), 66 deletions(-) delete mode 100644 requirements-all.txt diff --git a/.gitignore b/.gitignore index 4836eb0..a038e86 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ repositories/ .vscode/ .test-venv/ .temp/ -PRIVATE_* \ No newline at end of file +PRIVATE_* +private/* \ No newline at end of file diff --git a/build_shared_lib.py b/build_shared_lib.py index 819f819..594403c 100644 --- a/build_shared_lib.py +++ b/build_shared_lib.py @@ -1,5 +1,6 @@ # flake8: noqa +from argparse import ArgumentParser from llama_api.utils.llama_cpp import ( build_shared_lib, CPU_ARGS, # Only use CPU @@ -8,8 +9,24 @@ ) from os import environ +ARGS = { + "CPU": CPU_ARGS, + "METAL": METAL_ARGS, + "CUBLAS": CUBLAS_ARGS, + "CUDA": CUBLAS_ARGS, +} if 
__name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-b",
+        "--build_type",
+        type=lambda s: str(s).upper(),
+        default="CPU",
+        choices=["CPU", "METAL", "CUBLAS", "CUDA"],
+        help="Build type",
+    )
+
     environ["FORCE_CMAKE"] = "1"
-    environ["CMAKE_ARGS"] = CPU_ARGS  # EDIT THIS LINE TO CHANGE BUILD TYPE !!!
+    environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
 
     build_shared_lib()
diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml
index 08605d3..f018d07 100644
--- a/docker-compose.persistent.yml
+++ b/docker-compose.persistent.yml
@@ -5,7 +5,7 @@ volumes:
 
 services:
   llama-api:
-    image: cosogi/llama-api:230814
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
       - FORCE_CUDA=1
diff --git a/docker-compose.yml b/docker-compose.yml
index a914dfa..3c910ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3'
 
 services:
   llama-api:
-    image: cosogi/llama-api:230814
+    image: cosogi/llama-api:230816
     entrypoint: ["python3", "-m", "main", "--port", "8000"]
     environment:
       - FORCE_CUDA=1
diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py
index ba0c49d..b2aeb47 100644
--- a/llama_api/server/routers/v1.py
+++ b/llama_api/server/routers/v1.py
@@ -2,7 +2,7 @@
 Use same format as OpenAI API"""
 
 
-from asyncio import CancelledError, Task, create_task
+from asyncio import Task, create_task
 from contextlib import asynccontextmanager, contextmanager
 from dataclasses import dataclass, field
 from functools import partial
@@ -138,7 +138,7 @@ async def get_wix_with_semaphore(
         wix_meta = wix_metas[choice(candidates)]
         async with wix_meta.semaphore:
             if await request.is_disconnected():
-                raise CancelledError("Request is disconnected")
+                return
             wix_meta.processed_key = request_key
             yield wix_meta.wix
diff --git a/main.py b/main.py
index 7f8bd16..15877df 100644
--- a/main.py
+++ b/main.py
@@ -62,7 +62,7 @@
         help="Apply xformers' memory-efficient optimizations",
     )
     parser.add_argument(
-        "--disable-embeddings",
+        "--no-embed",
        action="store_true",
         help="Disable embeddings endpoint",
     )
@@ -80,6 +80,6 @@
             "LLAMA_API_XFORMERS": "1" if args.xformers else "",
             "LLAMA_API_API_KEY": args.api_key or "",
             "FORCE_CUDA": "1" if args.force_cuda else "",
-            "LLAMA_API_EMBEDDINGS": "1" if not args.disable_embeddings else "",
+            "LLAMA_API_EMBEDDINGS": "1" if not args.no_embed else "",
         },
     )
diff --git a/readme.md b/readme.md
index 7a3eadd..5a54926 100644
--- a/readme.md
+++ b/readme.md
@@ -3,11 +3,51 @@ This project aims to provide a simple way to run **LLama.cpp** and **Exllama** m
 
 You can use this server to run the models in your own application, or use it as a standalone API server!
 
+## Before you start
+
+1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/
+
+2. **llama.cpp**: To use llama.cpp, **Windows** users need to download [CMake](https://cmake.org/download/) to compile the library.
+
+3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). **Windows** users may also need to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).
+
+
+
+## How to run server
+
+All required packages will be installed automatically with this command.
+
+```bash
+python -m main --install-pkgs
+```
+
+If you already have all required packages installed, you can skip the installation with this command.
+```bash
+python -m main
+```
+Options:
+```bash
+usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-i] [-c] [--skip-torch-install] [--skip-tf-install] [--skip-compile] [-k API_KEY] [-x] [--no-embed]
+
+options:
+  -h, --help            show this help message and exit
+  -p PORT, --port PORT  Port to run the server on; default is 8000
+  -w MAX_WORKERS, --max-workers MAX_WORKERS
+                        Maximum number of process workers to run; default is 1
+  -i, --install-pkgs    Install all required packages before running the server
+  -c, --force-cuda      Force CUDA version of pytorch to be used when installing pytorch. e.g. torch==2.0.1+cu118
+  --skip-torch-install  Skip installing pytorch, if `install-pkgs` is set
+  --skip-tf-install     Skip installing tensorflow, if `install-pkgs` is set
+  --skip-compile        Skip compiling the shared library of LLaMA C++ code
+  -k API_KEY, --api-key API_KEY
+                        API key to use for the server
+  -x, --xformers        Apply xformers' memory-efficient optimizations
+  --no-embed            Disable embeddings endpoint
+```
 ### Unique features
 
 1. **On-Demand Model Loading**
-   > **Caution:** There is a bug where VRAM does not get freed when unloading, if **cuBLAS** is used in **llama.cpp**. This issue has been reported for a while but it's still unresolved.
    - The project tries to load the model defined in `model_definitions.py` into the worker process when it is sent along with the request JSON body. The worker continually uses the cached model and when a request for a different model comes in, it unloads the existing model and loads the new one.
 
 2. **Parallelism and Concurrency Enabled**
 
@@ -16,13 +56,6 @@ You can use this server to run the models in your own application, or use it as
 3. **Auto Dependency Installation**
    - The project automatically do git clones and installs the required dependencies, including **pytorch** and **tensorflow**, when the server is started. This is done by checking the `pyproject.toml` or `requirements.txt` file in the root directory of this project or other repositories. `pyproject.toml` will be parsed into `requirements.txt` with `poetry`. If you want to add more dependencies, simply add them to the file.
 
-## Before you start
-
-1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/
-
-2. **llama.cpp**: To use llama.cpp, and if you are **Windows** user, download [CMake](https://cmake.org/download/) to compile library.
-
-3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). Maybe **Windows** user needs to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).
 
 ## How to download the models
 
@@ -62,31 +95,7 @@ The path of the model has to be the folder name. Let's say, **orca_mini_7b**, wh
 ## Where to define the models
 Define llama.cpp & exllama models in `model_definitions.py`. You can define all necessary parameters to load the models there. Refer to the example in the file.
 
-## How to run server
-
-All required packages will be installed automatically with this command.
-
-```bash
-python -m main --install-pkgs
-```
-If you already have all required packages installed, you can skip the installation with this command.
-```bash -python -m main -``` -Options: -```b - -h, --help show this help message and exit - -p PORT, --port PORT Port to run the server on; default is 8000 - -w MAX_WORKERS, --max-workers MAX_WORKERS - Maximum number of process workers to run; default is 1 - --install-pkgs Install all required packages before running the server - --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - --skip-torch-install Skip installing pytorch, if `install-pkgs` is set - --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set - -k API_KEY, --api-key API_KEY - API key to use for the server -``` ## Usage: Text Completion Now, you can send a request to the server. diff --git a/requirements-all.txt b/requirements-all.txt deleted file mode 100644 index 8e1425d..0000000 --- a/requirements-all.txt +++ /dev/null @@ -1,26 +0,0 @@ -psutil -fastapi -uvicorn[standard] -transformers -orjson -llama_cpp_python[server] -safetensors==0.3.1 -sentencepiece>=0.1.97 -ninja==1.11.1 ---find-links https://download.pytorch.org/whl/torch_stable.html -torch==2.0.1+cu118 -numpy -scikit-learn -tensorflow>=2.0.0 -tensorflow-hub -scikit-build - - -# Dev -black -twine -flake8 -mkdocs -mkdocstrings -mkdocs-material -httpx \ No newline at end of file
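
With the two patches above applied, the quickest end-to-end check is a completion request against the running server. The snippet below is an illustrative sketch rather than part of the patch series: it assumes the server is listening on the default port 8000, that the v1 router exposes an OpenAI-style `/v1/completions` route (its docstring states "Use same format as OpenAI API"), and that a model named `orca_mini_7b` is defined in `model_definitions.py`; swap in your own host, model name, and API key (the header is only needed if the server was started with `--api-key`).

```python
# Illustrative sketch only: the endpoint path, model name, and payload fields
# follow the OpenAI completion format the server mirrors; adjust them to match
# your own model_definitions.py and server options.
import requests  # any HTTP client will do

response = requests.post(
    "http://localhost:8000/v1/completions",
    headers={"Authorization": "Bearer YOUR_API_KEY"},  # omit if --api-key is unset
    json={
        "model": "orca_mini_7b",  # a model name defined in model_definitions.py
        "prompt": "Q: What is a llama?\nA:",
        "max_tokens": 64,
        "temperature": 0.7,
        "stream": False,
    },
    timeout=600,  # the first request may trigger an on-demand model load
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```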