diff --git a/run_localGPT_API.py b/run_localGPT_API.py
index d3a42d71..f36be656 100644
--- a/run_localGPT_API.py
+++ b/run_localGPT_API.py
@@ -25,7 +25,7 @@
 )
 from werkzeug.utils import secure_filename
 
-from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY
+from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
 
 DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
 SHOW_SOURCES = True
@@ -64,30 +64,7 @@
 
 RETRIEVER = DB.as_retriever()
 
-# for HF models
-# model_id = "TheBloke/vicuna-7B-1.1-HF"
-# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
-# model_id = "TheBloke/guanaco-7B-HF"
-# model_id = 'NousResearch/Nous-Hermes-13b' # Requires ~ 23GB VRAM.
-# Using STransformers alongside will 100% create OOM on 24GB cards.
-# LLM = load_model(device_type=DEVICE_TYPE, model_id=model_id)
-
-# for GPTQ (quantized) models
-# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
-# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
-# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
-# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"
-# Requires ~21GB VRAM. Using STransformers alongside can potentially create OOM on 24GB cards.
-# model_id = "TheBloke/wizardLM-7B-GPTQ"
-# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"
-
-# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
-# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
-
-model_id = "TheBloke/Llama-2-7B-Chat-GGML"
-model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin"
-
-LLM = load_model(device_type=DEVICE_TYPE, model_id=model_id, model_basename=model_basename)
+LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
 
 QA = RetrievalQA.from_chain_type(
     llm=LLM, chain_type="stuff", retriever=RETRIEVER, return_source_documents=SHOW_SOURCES
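
Note: this change assumes constants.py already exports MODEL_ID and MODEL_BASENAME. The contents of constants.py are not part of this diff; a minimal sketch of what those definitions might look like, reusing the GGML model that the removed hardcoded values pointed at:

    # constants.py (illustrative sketch; the real definitions live outside this diff)
    # Central place to pick the model, so run_localGPT_API.py no longer needs editing
    # to switch between HF, GPTQ, or GGML checkpoints.
    MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
    MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"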