In [ ]:
# Option 1: install from source
# !git clone https://github.com/casper-hansen/AutoAWQ
# %cd AutoAWQ
# !pip install -e .
# %cd ..

# Option 2: quick install of the latest stable release
!pip install autoawq -q
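If you want to sanity-check the install before moving on, a quick version check is enough. The cell below is an optional sketch that only assumes the packages installed above.

In [ ]:
# Optional: confirm the installed AutoAWQ build and that a GPU is visible
from importlib.metadata import version
import torch

print("autoawq:", version("autoawq"))
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())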
In [ ]:
# install transformers from source (development version)
!pip install git+https://github.com/huggingface/transformers.git -q
!pip install huggingface_hub -q
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
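notebook_login() opens an interactive widget; in a plain script or terminal you can authenticate with an access token instead. A minimal, commented-out sketch (the token string is a placeholder for a write-scoped token from your Hugging Face account settings):

In [ ]:
# Alternative to the widget: log in programmatically with an access token
# from huggingface_hub import login
# login(token="hf_...")  # placeholder - use your own write-scoped token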
In [ ]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch

model_path = "PY007/TinyLlama-1.1B-Chat-v0.3"  # replace with your model path or model ID
quant_name = model_path.split("/")[-1] + "-AWQ"  # local output folder, e.g. "TinyLlama-1.1B-Chat-v0.3-AWQ"
quant_path = "AdithyaSK/" + quant_name  # Hub repo ID the quantized model will be pushed to

# AWQ settings: 4-bit weights, group size 128, zero-point quantization
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}

# Load model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize, then save the quantized weights (safetensors) and tokenizer locally
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_name, safetensors=True, shard_size="10GB")
tokenizer.save_pretrained(quant_name)
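Before pushing anything, it is worth checking what save_quantized actually wrote. The optional sketch below just lists the local output folder created above and the size of each file; it assumes the quant_name directory from the previous cell.

In [ ]:
# Optional: inspect the quantized output folder before uploading
import os

for file_name in sorted(os.listdir(quant_name)):
    size_mb = os.path.getsize(os.path.join(quant_name, file_name)) / 1e6
    print(f"{file_name:35s} {size_mb:9.1f} MB")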
Push models and tokenizers to Hub¶
In [ ]:
from huggingface_hub import create_repo

repo_id = "AdithyaSK/" + quant_name  # replace "AdithyaSK" with your Hub username
create_repo(repo_id, private=False)
In [ ]:
from huggingface_hub import HfApi

api = HfApi()

# Upload the quantized weights to the repo created above
path_in_repo = "model.safetensors"
local_file_path = "./" + quant_name + "/" + path_in_repo

api.upload_file(
    path_or_fileobj=local_file_path,
    path_in_repo=path_in_repo,
    repo_id=repo_id,
    repo_type="model",
)
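Uploading file by file works, but huggingface_hub also provides upload_folder, which pushes the entire local directory in one call. A commented-out sketch of that alternative, assuming the same quant_name folder and repo_id as above:

In [ ]:
# Alternative: push the whole quantized folder in a single call
# from huggingface_hub import HfApi
# api = HfApi()
# api.upload_folder(
#     folder_path="./" + quant_name,
#     repo_id=repo_id,
#     repo_type="model",
# )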
Upload non-Model Files¶
In [ ]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "AdithyaSK/" + quant_name

# Config and tokenizer files written by save_quantized / save_pretrained
local_file_paths = [
    "./" + quant_name + "/config.json",
    "./" + quant_name + "/generation_config.json",
    "./" + quant_name + "/quant_config.json",
    "./" + quant_name + "/special_tokens_map.json",
    "./" + quant_name + "/tokenizer_config.json",
    "./" + quant_name + "/tokenizer.json",
]

# Loop through each file and upload it to the root of the repo
for local_file_path in local_file_paths:
    file_name = local_file_path.split("/")[-1]
    path_in_repo = file_name

    api.upload_file(
        path_or_fileobj=local_file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
    )
    print(f"Uploaded {file_name} to {repo_id}")
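To confirm everything landed in the repo, listing the remote files is a quick sanity check; the sketch below assumes the repo_id defined above.

In [ ]:
# Optional: list the files now present in the Hub repo
from huggingface_hub import HfApi

api = HfApi()
print(api.list_repo_files(repo_id, repo_type="model"))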
Run AWQ Inference with AutoAWQ¶
In [ ]:
## Load AWQ model
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Note that the model must be in safetensors format!
# model_name_or_path = "TheBloke/Llama-2-7b-Chat-AWQ"
model_name_or_path = repo_id  # the repo pushed above

model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
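As a rough check that the 4-bit weights shrink the footprint, you can look at GPU memory right after loading. This sketch uses plain torch CUDA counters; exact numbers will vary with the runtime and fused kernels.

In [ ]:
# Optional: rough GPU memory footprint of the loaded AWQ model
import torch

print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")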
In [17]:
# ## Load the unquantized model in fp16 (baseline for comparison)
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# model_name_or_path = ""  # model name
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True, device_map="cuda")
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
In [ ]:
!nvidia-smi
In [ ]:
print(torch.cuda.get_device_name())
In [ ]:
import torch

prompt = "Who played the character Iron Man?"
# TinyLlama-Chat expects the ChatML prompt format
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
tokens = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()

# Generate output
generation_output = model.generate(tokens, do_sample=False, max_new_tokens=512)
print(tokenizer.decode(generation_output[0], skip_special_tokens=True))
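If you prefer to watch tokens as they are produced instead of waiting for the full completion, transformers' TextStreamer can be passed to generate. This is a sketch under the assumption that the AWQ model's generate forwards extra keyword arguments to the underlying Hugging Face generate; treat it as illustrative rather than the notebook's reference path.

In [ ]:
# Optional: stream tokens to stdout as they are generated
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(tokens, do_sample=False, max_new_tokens=256, streamer=streamer)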