Compare commits: adcb172da4 ... 7f0cb49156
3 Commits (SHA1): 7f0cb49156, 19870cdea8, 677eb6d0ea
download_model.py (new file, 37)
@@ -0,0 +1,37 @@
from inference import Inference
from modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()
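download_model.py pre-caches weights by instantiating Inference for every Modelconfig, which also loads each model onto the GPU. If only the Hugging Face cache needs to be populated, a lighter alternative is to fetch the files without loading them; a sketch using huggingface_hub (model list taken from the file above, helper not part of this diff):

# Sketch: populate the local Hugging Face cache without loading any model onto the GPU.
from huggingface_hub import snapshot_download

MODELS = [
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "unsloth/Llama-3.2-1B",
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
]

for repo_id in MODELS:
    path = snapshot_download(repo_id)  # downloads (or reuses) the cached snapshot
    print("cached %s at %s" % (repo_id, path))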
generation_loop.py (new file, 181)
@@ -0,0 +1,181 @@
import time
import json
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from inference import Inference, torch_reseed


def check_append_file(prompt: str) -> str:
    if prompt.startswith("@"):
        prompt = prompt[1:]  # Remove the '@'
        filename = prompt.split(" ")[0]
        try:
            with open(filename, "r") as f:
                content = f.read()
                return "'''%s'''\n\n%s" % (content, prompt)
        except:
            print(f"File '{filename}' not found.")
    return prompt


def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}


class Terminal:

    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages: list[dict] = [systemmessage]

        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text != None:
            self.messages.append({"role": role, "content": input_text})

        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]

        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})

        print("")
        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened.
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result != None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")

    def join(self):

        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")

            input_text = check_append_file(input_text)

            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # self.append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.messages = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.messages = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1)  # Generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in message_backup:
                    role = m["role"]
                    content = m["content"]
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.messages.append({"role": role, "content": content})
                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None)  # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt> answer as 'tool' in <tool_response> tags")
                print("/clear clear chat history")
                print("/undo undo latest prompt")
                print("/regen regenerate the last message")
                print("/more generate more additional information")
                print("/file read prompt input from file")
                print("/auto automatically advance conversation")
                print("/summarize generate a summary of the chat")
                print("/title generate a title of the chat")
                print("/save write chat history to file")
                print("/load load previously saved history")
                print("/help print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)
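generation_loop.py factors the interactive REPL out of llama.py into a reusable Terminal class. A minimal wiring sketch (same pattern the new llama.py uses further below; the model name and system prompt here are placeholders):

# Sketch: minimal use of generation_loop.Terminal; prompt text and model are illustrative only.
from inference import Inference
from modelconfig import Modelconfig
from generation_loop import Terminal, msg

inference = Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
terminal = Terminal(inference, msg("system", "Hold a casual conversation with the user."))
terminal.join()  # blocking REPL: plain prompts, '!', '@file' prefixes and '/' commands are handled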
inference.py (79)
@@ -17,41 +17,49 @@ import time
 import utils
 import re
 import os
+from modelconfig import Modelconfig

 torch.set_num_threads(os.cpu_count())  # Adjust this to the number of threads/cores you have


 class Inference:
-    def __init__(self):
-        print("loading LLM...")
+    def __init__(self, modelconfig: Modelconfig):
+        print("loading LLM '%s'..." % modelconfig.model_name)
         t_start = time.time()

         # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "unsloth/phi-4-unsloth-bnb-4bit"  # too big
         # model_name = "gpt2"
         # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
         # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
         # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model

-        quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",  # Recommended for better performance
-            bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
-            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
-        )
-
-        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+        # quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+        #     load_in_4bit=True,
+        #     bnb_4bit_quant_type="nf4",  # Recommended for better performance
+        #     bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
+        #     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
+        # )
+
+        # quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

         # Load the model with quantization (optional)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            # device_map="auto",  # Automatically places parts of the model on GPU/CPU
-            # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-            device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-            # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
-            quantization_config=quantization_config_8bit
-        )
+        if modelconfig.bits_and_bytes_config != None:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+                # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+                device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+                # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
+                quantization_config=modelconfig.bits_and_bytes_config
+            )
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                device_map="cuda",
+            )

         # print("apply optimization")
         # self.model.generation_config.cache_implementation = "static"
@@ -59,25 +67,25 @@ class Inference:


         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(modelconfig.model_name)

         print("load took %.3fs" % (time.time() - t_start))

-        max_context_length = self.model.config.max_position_embeddings
+        self.max_context_length = self.model.config.max_position_embeddings


         self.tokenizer.chat_template = utils.load_json_file("chat_template.json")

-        print("max_context_length is %d tokens." % (max_context_length))
+        print("max_context_length is %d tokens." % (self.max_context_length))


-    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate(self, input_ids: torch.Tensor, print_stdout=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
             with torch.no_grad():
-                return self.generate_incremental_2(input_ids)
+                return self.generate_incremental_2(input_ids, print_stdout)


-    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_batch(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         outputs = self.model.generate(
             input_ids,  # **inputs, inputs["input_ids"]
             max_new_tokens=500,  # max_length=max_context_length,
@@ -90,11 +98,12 @@ class Inference:
         # skip all input tokens and only output the additional generated part of the conversation
         input_token_count = len(input_ids[0])
         out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-        print(out_text)
+        if print_stdout:
+            print(out_text)
         return outputs, out_text


-    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental_2(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         generated_tokens = input_ids

         past_key_values = DynamicCache()
@@ -126,12 +135,14 @@ class Inference:
             # Decode and print the newly generated token (skip special tokens)
             # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
             out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True)  # Print without newline

             # Check if the generated token is the end-of-sequence token
             # if next_token.item() == self.tokenizer.eos_token_id:
             if new_tokens[-1].item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break

             # n += 1
@@ -150,12 +161,12 @@ class Inference:
         return generated_tokens, full_output


-    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
-            return self._generate_incremental(input_ids)
+            return self._generate_incremental(input_ids, print_stdout)


-    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def _generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         # Start with the initial input tokens
         generated_tokens = input_ids  # Initially, this is just the input tokens

@@ -183,11 +194,13 @@ class Inference:

             # Decode and print the newly generated token (skip special tokens)
             out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True)  # Print without newline

             # Check if the generated token is the end-of-sequence token
             if next_token.item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break

             n += 1
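The inference.py change takes the model and quantization settings from Modelconfig and threads a print_stdout flag through every generate path so callers can run generation silently (the profiling script below relies on this). A short sketch of both modes, with the model name as a placeholder:

# Sketch: the print_stdout flag added in this commit toggles token streaming to stdout.
from inference import Inference
from modelconfig import Modelconfig

inference = Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
input_ids = inference.tokenize([{"role": "user", "content": "Hello!"}], tokenize=True)

tokens, text = inference.generate(input_ids)                             # streams tokens as they are produced
tokens, text = inference.generate_batch(input_ids, print_stdout=False)   # silent, e.g. for profiling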
inference_profile_test.py (new file, 76)
@@ -0,0 +1,76 @@
from inference import Inference
from modelconfig import Modelconfig
import time
import nvidia_smi
import torch
import gc


def empty_cuda():
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()
    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)

    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]
    print("model: %s" % model_conf.model_name)
    print("tokens: %d tk" % tokens)
    print("time: %.3f s" % took)
    print("speed: %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top: %d MB" % vram_top)
    print("context: %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))


if __name__ == "__main__":
    main()
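inference_profile_test.py relies on an nvidia_smi module that is not part of this diff; it is assumed to expose get_gpu_stats() returning at least a "memory_used" value in MB. One possible shape for such a helper, sketched with the pynvml bindings (assumed interface, not the author's implementation):

# Sketch of an nvidia_smi.get_gpu_stats() compatible helper (assumption, not from this commit).
import pynvml

def get_gpu_stats(device_index: int = 0) -> dict:
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    pynvml.nvmlShutdown()
    return {
        "memory_used": mem.used // (1024 * 1024),    # MB, as expected by inference_profile_test.py
        "memory_total": mem.total // (1024 * 1024),  # MB
        "gpu_utilization": util.gpu,                 # percent
    }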
llama.py (188)
@@ -1,175 +1,43 @@
-import time
-import random
-from tool_helper import tool_list, parse_and_execute_tool_call
+from tool_helper import tool_list
 from tool_functions import register_dummy
-from inference import Inference, torch_reseed
+from inference import Inference
 import datetime
+import model_selection
+from generation_loop import Terminal, msg

-
-messages = []
-inference = None
-
-
-# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
-systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
-
-# system message for role flip so the model automatically answers for the user
-roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
-
-# system messages and user message to bring the model to summarize the entire conversation
-summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
-summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
-
-# system message to create a conversation title
-title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}


 register_dummy()


-def append_generate_chat(input_text: str, role="user"):
-    t_start = time.time()
-
-    # generate AI response
-    if input_text != None:
-        messages.append({"role": role, "content": input_text})
-
-    inputs = inference.tokenize(messages, tokenize=True)
-
-    outputs, out_text = inference.generate(inputs)
-
-    # append result to message history
-    messages.append({"role": "assistant", "content": out_text})
-
-    print("")
-    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
-
-    # handle tool call and check if a tool call has happened.
-    tool_result = parse_and_execute_tool_call(out_text, tool_list)
-    if tool_result != None:
-        # tool call happened
-        tool_result = "<tool_response>%s</tool_response>" % tool_result
-        # depending on the chat template the tool response tags must or must not be passed. :(
-        append_generate_chat(tool_result, role="tool")
-
-
-def main():
-    global messages
-    global inference
-
-    inference = Inference()
+def initialize_config(inference: Inference) -> Terminal:

+    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
+    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
     current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
-    messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
+    append_toolcalls = False
+    if append_toolcalls:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
+    else:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

-    while True:
-        # print an input prompt to receive text or commands
-        input_text = input(">>> ")
-        print("")
-
-        if input_text.startswith("!"):
-            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
-            # append_generate_chat("%s" % input_text[1:], role="tool") # depending on the chat template the tool response tags must or must not be passed. :(
-
-        elif input_text.startswith("/clear"):
-            print("clearing chat history")
-            start_msg = messages[0]
-            messages = [start_msg]
-            print("")
-
-        elif input_text.startswith("/history"):
-            history = inference.tokenize(messages, tokenize=False)
-            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
-            print(history)
-
-        elif input_text.startswith("/undo"):
-            if len(messages) > 2:
-                print("undo latest prompt")
-                messages = messages[:-2]
-            else:
-                print("cannot undo because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/regen"):
-            if len(messages) >= 2:
-                print("regenerating message (not working)")
-                messages = messages[:-1]
-                seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-                torch_reseed(seed)
-                append_generate_chat(None)
-            else:
-                print("cannot regenerate because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/more"):
-            append_generate_chat(None)
-
-        elif input_text.startswith("/file"):
-            filename = input_text[len("/file "):]
-            print("read '%s' for prompt:" % filename)
-            with open(filename, "r") as f:
-                content = f.read()
-            print(content)
-            append_generate_chat(content)
-
-        elif input_text.startswith("/auto"):
-            messages_backup = messages
-            messages = [roleflip]
-            for m in messages_backup:
-                role = m["role"]
-                content = m["content"]
-                if role == "user":
-                    role = "assistant"
-                elif role == "assistant":
-                    role = "user"
-                if role != "system":
-                    messages.append({"role": role, "content": content})
-            append_generate_chat(None)  # will automatically advance the conversation as 'user'
-            last_message = messages[-1]
-            last_message["role"] = "user"
-            messages = messages_backup + [last_message]
-            append_generate_chat(None)  # 'regular' chatbot answer
-
-        elif input_text.startswith("/summarize"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
-            # messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/title"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [title_prompt] + messages_temp  #+ [dict(title)]  # copy dict in last instance
-            messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/help"):
-            print("!<prompt> answer as 'tool' in <tool_response> tags")
-            print("/clear clear chat history")
-            print("/undo undo latest prompt")
-            print("/regen regenerate the last message")
-            print("/more generate more additional information")
-            print("/file read prompt input from file")
-            print("/auto automatically advance conversation")
-            print("/summarize generate a summary of the chat")
-            print("/title generate a title of the chat")
-            print("/help print this message")
-            print("")
-
-        elif input_text.startswith("/"):
-            print("unknown command.")
-
-        else:
-            append_generate_chat(input_text)
+    terminal = Terminal(inference, systemmessage)
+
+    # system message for role flip so the model automatically answers for the user
+    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
+
+    # system messages and user message to bring the model to summarize the entire conversation
+    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
+    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
+
+    # system message to create a conversation title
+    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
+    return terminal


 if __name__ == "__main__":
-    main()
+    inference = Inference(model_selection.get_model())
+    terminal = initialize_config(inference)
+    terminal.join()
model_selection.py (new file, 95)
@@ -0,0 +1,95 @@
from modelconfig import Modelconfig


def get_model() -> Modelconfig:

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model
modelconfig.py (new file, 20)
@@ -0,0 +1,20 @@
from transformers import BitsAndBytesConfig
import torch


class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",  # Recommended for better performance
                bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
                bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config
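Modelconfig resolves its quantization settings in three ways: build a default NF4 4-bit config, build a plain 8-bit config, or pass a caller-supplied BitsAndBytesConfig through unchanged. A small usage sketch (model names taken from model_selection.py; the custom config values are illustrative):

# Sketch: the three ways Modelconfig resolves its quantization config.
import torch
from transformers import BitsAndBytesConfig
from modelconfig import Modelconfig

cfg_8bit = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)   # default 8-bit config
cfg_4bit = Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True)   # default NF4 4-bit config
cfg_custom = Modelconfig(                                                  # explicit config passed through
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    bits_and_bytes_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
)
cfg_full = Modelconfig("unsloth/Llama-3.2-1B")                             # no quantization (bits_and_bytes_config is None)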
@@ -1,3 +1,4 @@
 transformers
 accelerate
 bitsandbytes
+pytest
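With pytest added to the dependencies, the flag handling in modelconfig.py can be checked without a GPU. A minimal test sketch (hypothetical file test_modelconfig.py, not part of this diff):

# Sketch: pytest-style checks for Modelconfig's quantization selection (hypothetical test).
from modelconfig import Modelconfig

def test_default_is_unquantized():
    assert Modelconfig("unsloth/Llama-3.2-1B").bits_and_bytes_config is None

def test_8bit_flag_builds_config():
    cfg = Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True)
    assert cfg.bits_and_bytes_config.load_in_8bit

def test_4bit_flag_builds_config():
    cfg = Modelconfig("unsloth/Llama-3.2-1B", load_in_4bit=True)
    assert cfg.bits_and_bytes_config.load_in_4bit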