Converting HuggingFace Models to GGUF/GGML¶
This notebook quantizes Hugging Face models into GGUF format and uploads the resulting files to the Hub.
We will be using Llama.cpp to quantize the model. It supports the following models:
- LLaMA 🦙
- LLaMA 2 🦙🦙
- Falcon
- Alpaca
- GPT4All
- Chinese LLaMA / Alpaca and Chinese LLaMA-2 / Alpaca-2
- Vigogne (French)
- Vicuna
- Koala
- OpenBuddy 🐶 (Multilingual)
- Pygmalion 7B / Metharme 7B
- WizardLM
- Baichuan-7B and its derivations (such as baichuan-7b-sft)
- Aquila-7B / AquilaChat-7B
This notebook can be run on a free Google Colab CPU or GPU machine.
On a CPU machine, quantizing a 7B model took me 10 to 15 minutes; on a GPU machine it took 2 to 3 minutes.
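If you are unsure which runtime you are on, the small check below tells you whether a GPU is available. It is a minimal sketch that only assumes Colab exposes the nvidia-smi binary on GPU runtimes.
In [ ]:
import shutil

# Minimal runtime check: Colab GPU runtimes ship the nvidia-smi binary, CPU-only runtimes do not.
if shutil.which("nvidia-smi"):
    print("GPU runtime detected - quantizing a 7B model should take a few minutes.")
else:
    print("CPU-only runtime - expect roughly 10 to 15 minutes for a 7B model.")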
In [ ]:
!pip install huggingface_hub
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
Download the base model from the Hugging Face Hub¶
Download the base model you want to quantize to GGUF format.
In [ ]:
from huggingface_hub import snapshot_download
model_id = "meta-llama/Llama-2-7b-hf" # @param {type:"string"}
local_directory = model_id.split("/")[-1]
snapshot_download(repo_id=model_id,
                  local_dir=local_directory,
                  local_dir_use_symlinks=False,
                  revision="main")
In [ ]:
# @title Installing Llama.cpp
!apt update -y
!apt install build-essential git cmake libopenblas-dev libeigen3-dev -y
!git clone https://github.com/ggerganov/llama.cpp
!pip install -r llama.cpp/requirements.txt
Quantisation¶
In [ ]:
# @title Choose Quantisation Type. { display-mode: "form" }
# @markdown ### Enter a name for the quantised model:
MODEL_NAME = 'quantizeModelName' # @param {type: "string"}
# @markdown ### Choose Quantisation Formats:
q2_k = False # @param {type:"boolean"}
q3_k_l = False # @param {type:"boolean"}
q3_k_m = False # @param {type:"boolean"}
q3_k_s = False # @param {type:"boolean"}
q4_0 = False # @param {type:"boolean"}
q4_1 = False # @param {type:"boolean"}
q4_k_m = True # @param {type:"boolean"}
q4_k_s = False # @param {type:"boolean"}
q5_0 = False # @param {type:"boolean"}
q5_1 = False # @param {type:"boolean"}
q5_k_m = True # @param {type:"boolean"}
q5_k_s = False # @param {type:"boolean"}
q6_k = False # @param {type:"boolean"}
q8_0 = False # @param {type:"boolean"}
import os

# Check if the output directory exists
if not os.path.exists(MODEL_NAME):
    # If it doesn't exist, create it
    os.mkdir(MODEL_NAME)
else:
    print(f"The directory {MODEL_NAME} already exists.")
In [ ]:
# @title Convert to 16-bit Precision
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert.py {local_directory} --outtype f16 --outfile {fp16}
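As a quick sanity check, you can confirm the fp16 file exists and inspect its size before building and quantizing. This is a small sketch that only assumes the conversion above wrote its output to the path stored in fp16.
In [ ]:
import os

# Confirm the fp16 conversion produced an output file and report its size.
assert os.path.exists(fp16), f"{fp16} was not created - check the convert.py output above."
print(f"{fp16}: {os.path.getsize(fp16) / 1e9:.2f} GB")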
In [ ]:
!cd llama.cpp && make LLAMA_OPENBLAS=1
In [ ]:
# @title Start Quantisation
QUANTIZATION_METHODS = [
("q2_k", q2_k),
("q3_k_l", q3_k_l),
("q3_k_m", q3_k_m),
("q3_k_s", q3_k_s),
("q4_0", q4_0),
("q4_1", q4_1),
("q4_k_m", q4_k_m),
("q4_k_s", q4_k_s),
("q5_0", q5_0),
("q5_1", q5_1),
("q5_k_m", q5_k_m),
("q5_k_s", q5_k_s),
("q6_k", q6_k),
("q8_0", q8_0),
]
for method, flag in QUANTIZATION_METHODS:
    if flag:
        qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
        !./llama.cpp/quantize {fp16} {qtype} {method}
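Once the loop finishes, the quantized files sit next to the fp16 file inside the MODEL_NAME directory. Here is a small sketch that lists them with their sizes, assuming the quantize runs above completed without errors:
In [ ]:
import os

# List every GGUF file produced by the quantisation loop, with its size in GB.
for file in sorted(os.listdir(MODEL_NAME)):
    if file.endswith(".gguf"):
        size_gb = os.path.getsize(os.path.join(MODEL_NAME, file)) / 1e9
        print(f"{file}: {size_gb:.2f} GB")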
Inference using Llama.cpp¶
In [ ]:
import os
model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]
prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")
# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{chosen_method}"
    # -n: number of tokens to generate, -ngl: layers to offload to the GPU (ignored on CPU-only builds)
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"
Inference using ctransformers¶
In [ ]:
!pip install "ctransformers>=0.2.24"
In [ ]:
from ctransformers import AutoModelForCausalLM
import os
model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]
prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")
# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{chosen_method}"
    llm = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id=qtype, model_type="llama", gpu_layers=0)
    for text in llm(prompt, stream=True):
        print(text, end="", flush=True)
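If you want more control over generation, ctransformers also accepts the usual sampling parameters. The sketch below is only illustrative: it reuses chosen_method and prompt from the cell above, and the parameter values are assumptions you should tune for your model.
In [ ]:
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id=f"{MODEL_NAME}/{chosen_method}",
    model_type="llama",
    gpu_layers=0,  # increase to offload layers if you installed a CUDA-enabled build
)

# Non-streaming call with explicit sampling parameters (illustrative values).
print(llm(prompt, max_new_tokens=128, temperature=0.7, top_p=0.95))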
Pushing to HuggingFace Hub¶
In [ ]:
username = "username"# @param {type:"string"}
from huggingface_hub import create_repo, HfApi
api = HfApi()
# Create empty repo
create_repo(
repo_id = f"{username}/{MODEL_NAME}-GGUF",
repo_type="model",
exist_ok=True,
)
# Upload gguf files
api.upload_folder(
folder_path=MODEL_NAME,
repo_id=f"{username}/{MODEL_NAME}-GGUF",
allow_patterns=f"*.gguf",
)
username = "username"# @param {type:"string"}
from huggingface_hub import create_repo, HfApi
api = HfApi()
# Create empty repo
create_repo(
repo_id = f"{username}/{MODEL_NAME}-GGUF",
repo_type="model",
exist_ok=True,
)
# Upload gguf files
api.upload_folder(
folder_path=MODEL_NAME,
repo_id=f"{username}/{MODEL_NAME}-GGUF",
allow_patterns=f"*.gguf",
)
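Optionally, give the new repo a minimal model card so it is not empty on the Hub. The sketch below uses upload_file and reuses api, username, and model_id from the earlier cells; the README text is only a placeholder that you should replace with details about your model.
In [ ]:
# Placeholder model card; replace the text with details about your quantized model.
readme = f"# {MODEL_NAME}-GGUF\n\nGGUF quantizations of {model_id}, produced with llama.cpp."

api.upload_file(
    path_or_fileobj=readme.encode(),
    path_in_repo="README.md",
    repo_id=f"{username}/{MODEL_NAME}-GGUF",
)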
In [ ]:
# Upload the tokeniser and other config files from the base model,
# skipping the original weight files
api.upload_folder(
    folder_path=local_directory,
    repo_id=f"{username}/{MODEL_NAME}-GGUF",
    ignore_patterns=["*.bin", "*.safetensors"],
)
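Finally, a quick way to confirm that everything landed in the repo is to list its files. This is a minimal check with list_repo_files, reusing api and username from above.
In [ ]:
# List the files now present in the uploaded repo.
for file in api.list_repo_files(repo_id=f"{username}/{MODEL_NAME}-GGUF"):
    print(file)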