Converting HuggingFace Models to GGUF/GGML¶
This notebook quantizes Hugging Face models into GGUF format and uploads the resulting files to the Hub.
We will be using Llama.cpp to quantize the model. It supports the following models:
- LLaMA 🦙
- LLaMA 2 🦙🦙
- Falcon
- Alpaca
- GPT4All
- Chinese LLaMA / Alpaca and Chinese LLaMA-2 / Alpaca-2
- Vigogne (French)
- Vicuna
- Koala
- OpenBuddy 🐶 (Multilingual)
- Pygmalion 7B / Metharme 7B
- WizardLM
- Baichuan-7B and its derivations (such as baichuan-7b-sft)
- Aquila-7B / AquilaChat-7B
This notebook can be run on a free Google Colab CPU or GPU machine.
On a CPU machine, quantizing a 7B model took me 10 to 15 minutes; on a GPU machine it took 2 to 3 minutes.
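If you are unsure which runtime you are on, the small check below tells you whether a GPU is available. It is a minimal sketch that only assumes Colab exposes the nvidia-smi binary on GPU runtimes.
In [ ]:
import shutil

# Minimal runtime check: Colab GPU runtimes ship the nvidia-smi binary, CPU-only runtimes do not.
if shutil.which("nvidia-smi"):
    print("GPU runtime detected - quantizing a 7B model should take a few minutes.")
else:
    print("CPU-only runtime - expect roughly 10 to 15 minutes for a 7B model.")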
In [ ]:
!pip install huggingface_hub
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
Download the base model from the Hugging Face Hub¶
Download the base model you want to quantize to GGUF format.
In [ ]:
from huggingface_hub import snapshot_download
model_id = "meta-llama/Llama-2-7b-hf" # @param {type:"string"}
local_directory = model_id.split("/")[-1]
snapshot_download(repo_id=model_id,
                  local_dir=local_directory,
                  local_dir_use_symlinks=False,
                  revision="main")
In [ ]:
# @title Installing Llama.cpp
!apt update -y
!apt install build-essential git cmake libopenblas-dev libeigen3-dev -y
!git clone https://github.com/ggerganov/llama.cpp
!pip install -r llama.cpp/requirements.txt
Quantisation¶
In [ ]:
# @title Choose Quantisation Type. { display-mode: "form" }
# @markdown ### Enter a name for the quantised model:
MODEL_NAME = 'quantizeModelName' # @param {type: "string"}
# @markdown ### Choose Quantisation Formats:
q2_k = False # @param {type:"boolean"}
q3_k_l = False # @param {type:"boolean"}
q3_k_m = False # @param {type:"boolean"}
q3_k_s = False # @param {type:"boolean"}
q4_0 = False # @param {type:"boolean"}
q4_1 = False # @param {type:"boolean"}
q4_k_m = True # @param {type:"boolean"}
q4_k_s = False # @param {type:"boolean"}
q5_0 = False # @param {type:"boolean"}
q5_1 = False # @param {type:"boolean"}
q5_k_m = True # @param {type:"boolean"}
q5_k_s = False # @param {type:"boolean"}
q6_k = False # @param {type:"boolean"}
q8_0 = False # @param {type:"boolean"}
import os

# Check if the output directory exists
if not os.path.exists(MODEL_NAME):
    # If it doesn't exist, create it
    os.mkdir(MODEL_NAME)
else:
    print(f"The directory {MODEL_NAME} already exists.")
In [ ]:
# @title Convert to 16-bit Precision
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert.py {local_directory} --outtype f16 --outfile {fp16}
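As a quick sanity check, you can confirm the fp16 file exists and inspect its size before building and quantizing. This is a small sketch that only assumes the conversion above wrote its output to the path stored in fp16.
In [ ]:
import os

# Confirm the fp16 conversion produced an output file and report its size.
assert os.path.exists(fp16), f"{fp16} was not created - check the convert.py output above."
print(f"{fp16}: {os.path.getsize(fp16) / 1e9:.2f} GB")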
In [ ]:
!cd llama.cpp && make LLAMA_OPENBLAS=1
In [ ]:
# @title Start Quantisation
QUANTIZATION_METHODS = [
("q2_k", q2_k),
("q3_k_l", q3_k_l),
("q3_k_m", q3_k_m),
("q3_k_s", q3_k_s),
("q4_0", q4_0),
("q4_1", q4_1),
("q4_k_m", q4_k_m),
("q4_k_s", q4_k_s),
("q5_0", q5_0),
("q5_1", q5_1),
("q5_k_m", q5_k_m),
("q5_k_s", q5_k_s),
("q6_k", q6_k),
("q8_0", q8_0),
]
for method, flag in QUANTIZATION_METHODS:
    if flag:
        qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
        !./llama.cpp/quantize {fp16} {qtype} {method}
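Once the loop finishes, the quantized files sit next to the fp16 file inside the MODEL_NAME directory. Here is a small sketch that lists them with their sizes, assuming the quantize runs above completed without errors:
In [ ]:
import os

# List every GGUF file produced by the quantisation loop, with its size in GB.
for file in sorted(os.listdir(MODEL_NAME)):
    if file.endswith(".gguf"):
        size_gb = os.path.getsize(os.path.join(MODEL_NAME, file)) / 1e9
        print(f"{file}: {size_gb:.2f} GB")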
Inference using Llama.cpp¶
In [ ]:
import os
model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]
prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")
# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{chosen_method}"
    # -n: number of tokens to generate, -ngl: layers to offload to the GPU (ignored on CPU-only builds)
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"
Inference using ctransformers¶
In [ ]:
!pip install "ctransformers>=0.2.24"
In [ ]:
from ctransformers import AutoModelForCausalLM
import os
model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]
prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")
# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{chosen_method}"
    llm = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id=qtype, model_type="llama", gpu_layers=0)
    for text in llm(prompt, stream=True):
        print(text, end="", flush=True)
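If you want more control over generation, ctransformers also accepts the usual sampling parameters. The sketch below is only illustrative: it reuses chosen_method and prompt from the cell above, and the parameter values are assumptions you should tune for your model.
In [ ]:
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id=f"{MODEL_NAME}/{chosen_method}",
    model_type="llama",
    gpu_layers=0,  # increase to offload layers if you installed a CUDA-enabled build
)

# Non-streaming call with explicit sampling parameters (illustrative values).
print(llm(prompt, max_new_tokens=128, temperature=0.7, top_p=0.95))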
Pushing to HuggingFace Hub¶
In [ ]:
username = "username"# @param {type:"string"}
from huggingface_hub import create_repo, HfApi
api = HfApi()
# Create empty repo
create_repo(
repo_id = f"{username}/{MODEL_NAME}-GGUF",
repo_type="model",
exist_ok=True,
)
# Upload gguf files
api.upload_folder(
folder_path=MODEL_NAME,
repo_id=f"{username}/{MODEL_NAME}-GGUF",
allow_patterns=f"*.gguf",
)
username = "username"# @param {type:"string"}
from huggingface_hub import create_repo, HfApi
api = HfApi()
# Create empty repo
create_repo(
repo_id = f"{username}/{MODEL_NAME}-GGUF",
repo_type="model",
exist_ok=True,
)
# Upload gguf files
api.upload_folder(
folder_path=MODEL_NAME,
repo_id=f"{username}/{MODEL_NAME}-GGUF",
allow_patterns=f"*.gguf",
)
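Optionally, give the new repo a minimal model card so it is not empty on the Hub. The sketch below uses upload_file and reuses api, username, and model_id from the earlier cells; the README text is only a placeholder that you should replace with details about your model.
In [ ]:
# Placeholder model card; replace the text with details about your quantized model.
readme = f"# {MODEL_NAME}-GGUF\n\nGGUF quantizations of {model_id}, produced with llama.cpp."

api.upload_file(
    path_or_fileobj=readme.encode(),
    path_in_repo="README.md",
    repo_id=f"{username}/{MODEL_NAME}-GGUF",
)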
In [ ]:
# Upload the tokeniser and other config files from the base model,
# skipping the original weight files
api.upload_folder(
    folder_path=local_directory,
    repo_id=f"{username}/{MODEL_NAME}-GGUF",
    ignore_patterns=["*.bin", "*.safetensors"],
)
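Finally, a quick way to confirm that everything landed in the repo is to list its files. This is a minimal check with list_repo_files, reusing api and username from above.
In [ ]:
# List the files now present in the uploaded repo.
for file in api.list_repo_files(repo_id=f"{username}/{MODEL_NAME}-GGUF"):
    print(file)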