In [ ]:
# Option 1: install from source
# !git clone https://github.com/casper-hansen/AutoAWQ
# %cd AutoAWQ
# !pip install -e .
# %cd ..

# Option 2: quick install of the latest stable release
!pip install autoawq -q
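If you want to sanity-check the install before moving on, a quick version check is enough. The cell below is an optional sketch that only assumes the packages installed above.

In [ ]:
# Optional: confirm the installed AutoAWQ build and that a GPU is visible
from importlib.metadata import version
import torch

print("autoawq:", version("autoawq"))
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())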
In [ ]:
# install transformers from source (development version)
!pip install git+https://github.com/huggingface/transformers.git -q
!pip install huggingface_hub -q
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
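notebook_login() opens an interactive widget; in a plain script or terminal you can authenticate with an access token instead. A minimal, commented-out sketch (the token string is a placeholder for a write-scoped token from your Hugging Face account settings):

In [ ]:
# Alternative to the widget: log in programmatically with an access token
# from huggingface_hub import login
# login(token="hf_...")  # placeholder - use your own write-scoped token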
In [ ]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch

model_path = "PY007/TinyLlama-1.1B-Chat-v0.3"  # replace with your model path or model ID
quant_name = model_path.split("/")[-1] + "-AWQ"  # local output folder, e.g. "TinyLlama-1.1B-Chat-v0.3-AWQ"
quant_path = "AdithyaSK/" + quant_name  # Hub repo ID the quantized model will be pushed to

# AWQ settings: 4-bit weights, group size 128, zero-point quantization
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}

# Load model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize, then save the quantized weights (safetensors) and tokenizer locally
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_name, safetensors=True, shard_size="10GB")
tokenizer.save_pretrained(quant_name)
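Before pushing anything, it is worth checking what save_quantized actually wrote. The optional sketch below just lists the local output folder created above and the size of each file; it assumes the quant_name directory from the previous cell.

In [ ]:
# Optional: inspect the quantized output folder before uploading
import os

for file_name in sorted(os.listdir(quant_name)):
    size_mb = os.path.getsize(os.path.join(quant_name, file_name)) / 1e6
    print(f"{file_name:35s} {size_mb:9.1f} MB")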
Push models and tokenizers to Hub¶
In [ ]:
from huggingface_hub import create_repo

repo_id = "AdithyaSK/" + quant_name  # replace "AdithyaSK" with your Hub username
create_repo(repo_id, private=False)
In [ ]:
from huggingface_hub import HfApi

api = HfApi()

# Upload the quantized weights to the repo created above
path_in_repo = "model.safetensors"
local_file_path = "./" + quant_name + "/" + path_in_repo

api.upload_file(
    path_or_fileobj=local_file_path,
    path_in_repo=path_in_repo,
    repo_id=repo_id,
    repo_type="model",
)
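Uploading file by file works, but huggingface_hub also provides upload_folder, which pushes the entire local directory in one call. A commented-out sketch of that alternative, assuming the same quant_name folder and repo_id as above:

In [ ]:
# Alternative: push the whole quantized folder in a single call
# from huggingface_hub import HfApi
# api = HfApi()
# api.upload_folder(
#     folder_path="./" + quant_name,
#     repo_id=repo_id,
#     repo_type="model",
# )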
Upload non-Model Files¶
In [ ]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "AdithyaSK/" + quant_name

# Config and tokenizer files written by save_quantized / save_pretrained
local_file_paths = [
    "./" + quant_name + "/config.json",
    "./" + quant_name + "/generation_config.json",
    "./" + quant_name + "/quant_config.json",
    "./" + quant_name + "/special_tokens_map.json",
    "./" + quant_name + "/tokenizer_config.json",
    "./" + quant_name + "/tokenizer.json",
]

# Loop through each file and upload it to the root of the repo
for local_file_path in local_file_paths:
    file_name = local_file_path.split("/")[-1]
    path_in_repo = file_name

    api.upload_file(
        path_or_fileobj=local_file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
    )
    print(f"Uploaded {file_name} to {repo_id}")
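To confirm everything landed in the repo, listing the remote files is a quick sanity check; the sketch below assumes the repo_id defined above.

In [ ]:
# Optional: list the files now present in the Hub repo
from huggingface_hub import HfApi

api = HfApi()
print(api.list_repo_files(repo_id, repo_type="model"))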
Run AWQ Inference with AutoAWQ¶
In [ ]:
## Load AWQ model
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Note that the model must be in safetensors format!
# model_name_or_path = "TheBloke/Llama-2-7b-Chat-AWQ"
model_name_or_path = repo_id  # the repo pushed above

model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
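As a rough check that the 4-bit weights shrink the footprint, you can look at GPU memory right after loading. This sketch uses plain torch CUDA counters; exact numbers will vary with the runtime and fused kernels.

In [ ]:
# Optional: rough GPU memory footprint of the loaded AWQ model
import torch

print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")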
In [17]:
# ## Load the unquantized model in fp16 (baseline for comparison)
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# model_name_or_path = ""  # model name
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True, device_map="cuda")
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
In [ ]:
!nvidia-smi
In [ ]:
print(torch.cuda.get_device_name())
In [ ]:
import torch

prompt = "Who played the character Iron Man?"
# TinyLlama-Chat expects the ChatML prompt format
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
tokens = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()

# Generate output
generation_output = model.generate(tokens, do_sample=False, max_new_tokens=512)
print(tokenizer.decode(generation_output[0], skip_special_tokens=True))
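If you prefer to watch tokens as they are produced instead of waiting for the full completion, transformers' TextStreamer can be passed to generate. This is a sketch under the assumption that the AWQ model's generate forwards extra keyword arguments to the underlying Hugging Face generate; treat it as illustrative rather than the notebook's reference path.

In [ ]:
# Optional: stream tokens to stdout as they are generated
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(tokens, do_sample=False, max_new_tokens=256, streamer=streamer)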