diff --git a/download_model.py b/download_model.py
new file mode 100644
index 0000000..cb0655f
--- /dev/null
+++ b/download_model.py
@@ -0,0 +1,37 @@
+
+
+from inference import Inference
+from modelconfig import Modelconfig
+
+
+def main():
+    # Model size: 3.21B params
+    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
+
+    # Model size: 1.24B params
+    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))
+
+    # Model size: 3.21B params
+    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
+
+    # Model size: 4.65B params
+    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))
+
+    # Model size: 3.21B params
+    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))
+
+    # Model size: 5.21B params
+    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))
+
+    # Model size: 4.46B params
+    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))
+
+    # Model size: 3.09B params
+    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
+
+    # Model size: 3.87B params
+    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/inference.py b/inference.py
index 9871640..005c74d 100644
--- a/inference.py
+++ b/inference.py
@@ -17,41 +17,49 @@ import time
 import utils
 import re
 import os
+from modelconfig import Modelconfig
 
 
 torch.set_num_threads(os.cpu_count()) # Adjust this to the number of threads/cores you have
 
 
 class Inference:
-    def __init__(self):
-        print("loading LLM...")
+    def __init__(self, modelconfig: Modelconfig):
+        print("loading LLM '%s'..." % modelconfig.model_name)
         t_start = time.time()
 
         # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
-        model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "unsloth/phi-4-unsloth-bnb-4bit" #too big
         # model_name = "gpt2"
         # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
         # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
         # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
 
-        quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4", # Recommended for better performance
-            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
-            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
-        )
+        # quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
+        #     load_in_4bit=True,
+        #     bnb_4bit_quant_type="nf4", # Recommended for better performance
+        #     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+        #     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+        # )
 
-        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+        # quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
 
         # Load the model with quantization (optional)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            # device_map="auto", # Automatically places parts of the model on GPU/CPU
-            # device_map="cuda", # Automatically places parts of the model on GPU/CPU
-            device_map="cuda", # Automatically places parts of the model on GPU/CPU
-            # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
-            quantization_config=quantization_config_8bit
-        )
+        if modelconfig.bits_and_bytes_config is not None:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                # device_map="auto", # Automatically places parts of the model on GPU/CPU
+                # device_map="cuda", # Automatically places parts of the model on GPU/CPU
+                device_map="cuda", # Automatically places parts of the model on GPU/CPU
+                # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
+                quantization_config=modelconfig.bits_and_bytes_config
+            )
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                device_map="cuda",
+            )
 
         # print("apply optimization")
         # self.model.generation_config.cache_implementation = "static"
@@ -59,25 +67,25 @@ class Inference:
 
 
         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(modelconfig.model_name)
 
         print("load took %.3fs" % (time.time() - t_start))
 
-        max_context_length = self.model.config.max_position_embeddings
+        self.max_context_length = self.model.config.max_position_embeddings
 
         self.tokenizer.chat_template = utils.load_json_file("chat_template.json")
 
-        print("max_context_length is %d tokens." % (max_context_length))
+        print("max_context_length is %d tokens." % (self.max_context_length))
 
 
-    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate(self, input_ids: torch.Tensor, print_stdout=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
             with torch.no_grad():
-                return self.generate_incremental_2(input_ids)
+                return self.generate_incremental_2(input_ids, print_stdout)
 
 
-    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_batch(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         outputs = self.model.generate(
             input_ids, # **inputs, inputs["input_ids"]
             max_new_tokens=500,
             # max_length=max_context_length,
@@ -90,11 +98,12 @@ class Inference:
         # skip all input tokens and only output the additional generated part of the conversation
         input_token_count = len(input_ids[0])
         out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-        print(out_text)
+        if print_stdout:
+            print(out_text)
         return outputs, out_text
 
 
-    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental_2(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         generated_tokens = input_ids
 
         past_key_values = DynamicCache()
@@ -126,12 +135,14 @@ class Inference:
             # Decode and print the newly generated token (skip special tokens)
             # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
             out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            print(out_text, end="", flush=True) # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True) # Print without newline
 
             # Check if the generated token is the end-of-sequence token
             # if next_token.item() == self.tokenizer.eos_token_id:
             if new_tokens[-1].item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break
 
             # n += 1
@@ -150,12 +161,12 @@ class Inference:
         return generated_tokens, full_output
 
 
-    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
-            return self._generate_incremental(input_ids)
+            return self._generate_incremental(input_ids, print_stdout)
 
 
-    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def _generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         # Start with the initial input tokens
         generated_tokens = input_ids # Initially, this is just the input tokens
 
@@ -183,11 +194,13 @@ class Inference:
 
             # Decode and print the newly generated token (skip special tokens)
             out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True) # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True) # Print without newline
 
             # Check if the generated token is the end-of-sequence token
             if next_token.item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break
 
             n += 1
diff --git a/inference_profile_test.py b/inference_profile_test.py
new file mode 100644
index 0000000..aa45d83
--- /dev/null
+++ b/inference_profile_test.py
@@ -0,0 +1,76 @@
+from inference import Inference
+from modelconfig import Modelconfig
+import time
+import nvidia_smi
+import torch
+import gc
+
+
+def empty_cuda():
+    while True:
+        gc.collect()
+        torch.cuda.empty_cache()
+        time.sleep(0.5)
+        vram = nvidia_smi.get_gpu_stats()["memory_used"]
+        print("vram: %d MB" % vram)
+        if vram < 200:
+            return
+
+
+def profile_ex(model_conf: Modelconfig):
+    print("")
+    empty_cuda()
+    messages = [
+        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
+        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
+    ]
+
+    gpu_stats_before = nvidia_smi.get_gpu_stats()
+    inference = Inference(model_conf)
+
+    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
+    t_start = time.time()
+    input_ids = inference.tokenize(messages, tokenize=True)
+    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
+    t_end = time.time()
+    gpu_stats_after = nvidia_smi.get_gpu_stats()
+
+    took = t_end - t_start
+    tokens = len(generated_tokens[0])
+    tokens_per = tokens / took
+    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
+    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]
+    print("model: %s" % model_conf.model_name)
+    print("tokens: %d tk" % tokens)
+    print("time: %.3f s" % took)
+    print("speed: %.3f tk/s" % tokens_per)
+    print("vram_bulk: %d MB" % vram_bulk)
+    print("vram_top: %d MB" % vram_top)
+    print("context: %d tk" % inference.max_context_length)
+    print("")
+
+
+def profile(model_conf):
+    try:
+        profile_ex(model_conf)
+    except Exception as e:
+        print("exception: " + str(e))
+        pass
+
+
+def main():
+    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
+    profile(Modelconfig("unsloth/Llama-3.2-1B"))
+    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
+    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
+    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
+    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
+    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
+    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
+    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
+    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/llama.py b/llama.py
index 1266861..d3da9ad 100644
--- a/llama.py
+++ b/llama.py
@@ -4,7 +4,7 @@ from tool_helper import tool_list, parse_and_execute_tool_call
 from tool_functions import register_dummy
 from inference import Inference, torch_reseed
 import datetime
-
+from modelconfig import Modelconfig
 
 
 messages = []
@@ -12,6 +12,7 @@ inference = None
 
 # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
 systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
+systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."
 
 # system message for role flip so the model automatically answers for the user
 roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -23,7 +24,7 @@ summarize_user = {"role": "system", "content": "Can you summarize the conversati
 
 # system message to create a conversation title
 title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}
 
-append_toolcalls = True
+append_toolcalls = False
 
 register_dummy()
@@ -38,6 +39,7 @@ def append_generate_chat(input_text: str, role="user"):
     messages.append({"role": role, "content": input_text})
 
     inputs = inference.tokenize(messages, tokenize=True)
+    number_of_input_tokens = inputs.shape[1]
 
     outputs, out_text = inference.generate(inputs)
 
@@ -45,7 +47,10 @@ def append_generate_chat(input_text: str, role="user"):
 
     messages.append({"role": "assistant", "content": out_text})
     print("")
-    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
+    time_taken = time.time() - t_start
+    number_of_tokens = len(outputs[0])
+    tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
+    print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))
 
     # handle tool call and check if a tool call has happened.
     tool_result = parse_and_execute_tool_call(out_text, tool_list)
@@ -56,20 +61,10 @@ def append_generate_chat(input_text: str, role="user"):
         append_generate_chat(tool_result, role="tool")
 
 
-
-
-def main():
+def terminal_generation_loop():
     global messages
     global inference
-    inference = Inference()
-
-    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
-    if append_toolcalls:
-        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
-    else:
-        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
-
 
     while True:
         # print an input prompt to receive text or commands
         input_text = input(">>> ")
@@ -173,6 +168,106 @@ def main():
             append_generate_chat(input_text)
 
 
 
+def main():
+    global messages
+    global inference
+
+    # model: NousResearch/Hermes-3-Llama-3.2-3B
+    # tokens: 315 tk
+    # time: 94.360 s
+    # speed: 3.338 tk/s
+    # vram_bulk: 3622 MB
+    # vram_top: 80 MB
+    # context: 131072 tk
+    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)
+
+    # model: unsloth/Llama-3.2-1B
+    # tokens: 589 tk
+    # time: 39.348 s
+    # speed: 14.969 tk/s
+    # vram_bulk: 4708 MB
+    # vram_top: 102 MB
+    # context: 131072 tk
+    # model = Modelconfig("unsloth/Llama-3.2-1B") # note, fast, but talks to itself. basically does not work.
+
+    # model: unsloth/Llama-3.2-3B-Instruct
+    # tokens: 285 tk
+    # time: 75.363 s
+    # speed: 3.782 tk/s
+    # vram_bulk: 3512 MB
+    # vram_top: 48 MB
+    # context: 131072 tk
+    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
+
+    # model: unsloth/llama-3-8b-bnb-4bit
+    # tokens: 435 tk
+    # time: 84.314 s
+    # speed: 5.159 tk/s
+    # vram_bulk: 5440 MB
+    # vram_top: 216 MB
+    # context: 8192 tk
+    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")
+
+    # Model size: 3.21B params
+    # vram used: xxxxx MB
+    # speed xxxxx t/s
+    # working: DOES NOT LOAD
+    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)
+
+    # model: unsloth/gemma-2-9b-it-bnb-4bit
+    # tokens: 154 tk
+    # time: 32.727 s
+    # speed: 4.706 tk/s
+    # vram_bulk: 6156 MB
+    # vram_top: 232 MB
+    # context: 8192 tk
+    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")
+
+    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
+    # tokens: 120 tk
+    # time: 12.248 s
+    # speed: 9.798 tk/s
+    # vram_bulk: 5382 MB
+    # vram_top: 170 MB
+    # context: 32768 tk
+    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit") # note, this works really well
+
+    # model: unsloth/Qwen2.5-3B-Instruct
+    # tokens: 112 tk
+    # time: 12.703 s
+    # speed: 8.816 tk/s
+    # vram_bulk: 2108 MB
+    # vram_top: 98 MB
+    # context: 32768 tk
+    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
+
+    # model: unsloth/Qwen2.5-3B-Instruct
+    # tokens: 118 tk
+    # time: 33.748 s
+    # speed: 3.497 tk/s
+    # vram_bulk: 3310 MB
+    # vram_top: 60 MB
+    # context: 32768 tk
+    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)
+
+    # Model size: 3.87B params
+    # vram used: xxxxx MB
+    # speed xxxxx t/s
+    # error: requires the protobuf library but it was not found in your environment
+    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
+
+
+    inference = Inference(model)
+
+    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and it's %H:%M %p right now.")
+    if append_toolcalls:
+        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
+    else:
+        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
+
+    terminal_generation_loop()
+
+
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/modelconfig.py b/modelconfig.py
new file mode 100644
index 0000000..3d51429
--- /dev/null
+++ b/modelconfig.py
@@ -0,0 +1,20 @@
+
+from transformers import BitsAndBytesConfig
+import torch
+
+class Modelconfig:
+    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
+        self.model_name = model_name
+        if load_in_4bit:
+            assert bits_and_bytes_config is None
+            self.bits_and_bytes_config = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4", # Recommended for better performance
+                bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+                bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+            )
+        elif load_in_8bit:
+            assert bits_and_bytes_config is None
+            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
+        else:
+            self.bits_and_bytes_config = bits_and_bytes_config
\ No newline at end of file