try out some more models

master
Florin Tobler, 5 months ago
commit 19870cdea8
  1. download_model.py (37 changed lines)
  2. inference.py (79 changed lines)
  3. inference_profile_test.py (76 changed lines)
  4. llama.py (123 changed lines)
  5. modelconfig.py (20 changed lines)

download_model.py (37 changed lines)

@@ -0,0 +1,37 @@
from inference import Inference
from modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()
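Note: download_model.py fills the local Hugging Face cache by constructing an Inference instance for each model, which also loads the weights onto the GPU as a side effect. A lighter alternative, sketched here as an assumption (not part of this commit), would be to fetch only the files into the cache:

    from huggingface_hub import snapshot_download

    # Download the model files into the local cache without loading them onto the GPU.
    snapshot_download(repo_id="unsloth/Llama-3.2-1B")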

inference.py (79 changed lines)

@@ -17,41 +17,49 @@ import time
import utils
import re
import os
from modelconfig import Modelconfig

torch.set_num_threads(os.cpu_count()) # Adjust this to the number of threads/cores you have


class Inference:
    def __init__(self):
        print("loading LLM...")
    def __init__(self, modelconfig: Modelconfig):
        print("loading LLM '%s'..." % modelconfig.model_name)
        t_start = time.time()

        # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "unsloth/phi-4-unsloth-bnb-4bit" #too big
        # model_name = "gpt2"
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model

        quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4", # Recommended for better performance
            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        )
        # quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
        #     load_in_4bit=True,
        #     bnb_4bit_quant_type="nf4", # Recommended for better performance
        #     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
        #     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        # )

        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
        # quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

        # Load the model with quantization (optional)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # device_map="auto", # Automatically places parts of the model on GPU/CPU
            # device_map="cuda", # Automatically places parts of the model on GPU/CPU
            device_map="cuda", # Automatically places parts of the model on GPU/CPU
            # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
            quantization_config=quantization_config_8bit
        )
        if modelconfig.bits_and_bytes_config != None:
            self.model = AutoModelForCausalLM.from_pretrained(
                modelconfig.model_name,
                # device_map="auto", # Automatically places parts of the model on GPU/CPU
                # device_map="cuda", # Automatically places parts of the model on GPU/CPU
                device_map="cuda", # Automatically places parts of the model on GPU/CPU
                # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
                quantization_config=modelconfig.bits_and_bytes_config
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                modelconfig.model_name,
                device_map="cuda",
            )

        # print("apply optimization")
        # self.model.generation_config.cache_implementation = "static"
@@ -59,25 +67,25 @@ class Inference:
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(modelconfig.model_name)
        print("load took %.3fs" % (time.time() - t_start))

        max_context_length = self.model.config.max_position_embeddings
        self.max_context_length = self.model.config.max_position_embeddings
        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")
        print("max_context_length is %d tokens." % (max_context_length))
        print("max_context_length is %d tokens." % (self.max_context_length))

    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate(self, input_ids: torch.Tensor, print_stdout=True) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            with torch.no_grad():
                return self.generate_incremental_2(input_ids)
                return self.generate_incremental_2(input_ids, print_stdout)

    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate_batch(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids, # **inputs, inputs["input_ids"]
            max_new_tokens=500, # max_length=max_context_length,
@@ -90,11 +98,12 @@ class Inference:
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        if print_stdout:
            print(out_text)
        return outputs, out_text

    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate_incremental_2(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        generated_tokens = input_ids
        past_key_values = DynamicCache()
@@ -126,12 +135,14 @@ class Inference:
            # Decode and print the newly generated token (skip special tokens)
            # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
            out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(out_text, end="", flush=True) # Print without newline
            if print_stdout:
                print(out_text, end="", flush=True) # Print without newline

            # Check if the generated token is the end-of-sequence token
            # if next_token.item() == self.tokenizer.eos_token_id:
            if new_tokens[-1].item() == self.tokenizer.eos_token_id:
                print("")
                if print_stdout:
                    print("")
                break

            # n += 1
@@ -150,12 +161,12 @@ class Inference:
        return generated_tokens, full_output

    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            return self._generate_incremental(input_ids)
            return self._generate_incremental(input_ids, print_stdout)

    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def _generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        # Start with the initial input tokens
        generated_tokens = input_ids # Initially, this is just the input tokens
@@ -183,11 +194,13 @@ class Inference:
            # Decode and print the newly generated token (skip special tokens)
            out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
            print(out_text, end="", flush=True) # Print without newline
            if print_stdout:
                print(out_text, end="", flush=True) # Print without newline

            # Check if the generated token is the end-of-sequence token
            if next_token.item() == self.tokenizer.eos_token_id:
                print("")
                if print_stdout:
                    print("")
                break

            n += 1
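Note: the changes above make Inference take a Modelconfig and add a print_stdout flag so generation can run silently (used by the profiling script below). A minimal usage sketch, assuming chat_template.json and the repository's tokenize() helper are available:

    from inference import Inference
    from modelconfig import Modelconfig

    inference = Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    messages = [{"role": "user", "content": "Hello!"}]
    input_ids = inference.tokenize(messages, tokenize=True)
    tokens, text = inference.generate(input_ids, print_stdout=False)  # suppress token streaming to stdout
    print(text)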

inference_profile_test.py (76 changed lines)

@@ -0,0 +1,76 @@
from inference import Inference
from modelconfig import Modelconfig
import time
import nvidia_smi
import torch
import gc


def empty_cuda():
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()

    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)
    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]

    print("model: %s" % model_conf.model_name)
    print("tokens: %d tk" % tokens)
    print("time: %.3f s" % took)
    print("speed: %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top: %d MB" % vram_top)
    print("context: %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))
        pass


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))


if __name__ == "__main__":
    main()
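Note: the nvidia_smi module imported here is a local helper from this repository and is not part of this commit. A hypothetical stand-in based on pynvml (assumed, for illustration only) could look like this:

    import pynvml

    def get_gpu_stats():
        # Hypothetical replacement for the repository's nvidia_smi helper.
        # Returns GPU 0 memory usage in MB, matching how "memory_used" is read above.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        pynvml.nvmlShutdown()
        return {"memory_used": mem.used // (1024 * 1024)}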

llama.py (123 changed lines)

@@ -4,7 +4,7 @@ from tool_helper import tool_list, parse_and_execute_tool_call
from tool_functions import register_dummy
from inference import Inference, torch_reseed
import datetime
from modelconfig import Modelconfig

messages = []
@@ -12,6 +12,7 @@ inference = None
# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."

# system message for role flip so the model automatically answers for the user
roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -23,7 +24,7 @@ summarize_user = {"role": "system", "content": "Can you summarize the conversati
# system message to create a conversation title
title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}

append_toolcalls = True
append_toolcalls = False

register_dummy()
@@ -38,6 +39,7 @@ def append_generate_chat(input_text: str, role="user"):
    messages.append({"role": role, "content": input_text})

    inputs = inference.tokenize(messages, tokenize=True)
    number_of_input_tokens = inputs.shape[1]
    outputs, out_text = inference.generate(inputs)
@@ -45,7 +47,10 @@ def append_generate_chat(input_text: str, role="user"):
    messages.append({"role": "assistant", "content": out_text})

    print("")
    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
    time_taken = time.time() - t_start
    number_of_tokens = len(outputs[0])
    tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
    print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

    # handle tool call and check if a tool call has happened.
    tool_result = parse_and_execute_tool_call(out_text, tool_list)
@@ -56,20 +61,10 @@ def append_generate_chat(input_text: str, role="user"):
        append_generate_chat(tool_result, role="tool")


def main():
def terminal_generation_loop():
    global messages
    global inference

    inference = Inference()
    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    if append_toolcalls:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
    else:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]

    while True:
        # print an input prompt to receive text or commands
        input_text = input(">>> ")
@@ -173,6 +168,106 @@ def main():
            append_generate_chat(input_text)


def main():
    global messages
    global inference

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B") # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit") # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    inference = Inference(model)
    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    if append_toolcalls:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
    else:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
    terminal_generation_loop()


if __name__ == "__main__":
    main()
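Note: the benchmark comments in main() were produced with inference_profile_test.py; vram_bulk is the VRAM taken when the model is loaded and vram_top is the additional VRAM used during generation, as measured in profile_ex(). The recorded speed is simply tokens divided by time, for example for the selected Qwen2.5-7B config:

    tokens = 120       # values taken from the comment block above
    seconds = 12.248
    print("%.3f tk/s" % (tokens / seconds))  # 9.798 tk/s, matching the recorded speed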

modelconfig.py (20 changed lines)

@@ -0,0 +1,20 @@
from transformers import BitsAndBytesConfig
import torch


class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4", # Recommended for better performance
                bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
                bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config
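Note: Modelconfig covers three cases: load_in_4bit builds an nf4 BitsAndBytesConfig, load_in_8bit builds a plain 8-bit one, and otherwise any explicitly passed bits_and_bytes_config (or None for an unquantized load) is stored as-is. A usage sketch, with the custom config shown as an assumption rather than a configuration used in this commit:

    from transformers import BitsAndBytesConfig
    import torch
    from modelconfig import Modelconfig

    cfg_4bit = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
    cfg_8bit = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
    cfg_full = Modelconfig("unsloth/Llama-3.2-1B")  # bits_and_bytes_config stays None, model loads unquantized
    cfg_custom = Modelconfig(
        "NousResearch/Hermes-3-Llama-3.2-3B",
        bits_and_bytes_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
    )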