
refactoring

master
Florin Tobler 5 months ago
parent commit 7f0cb49156
  1. generation_loop.py (181 changed lines)
  2. llama.py (284 changed lines)
  3. model_selection.py (95 changed lines)
  4. requirements.txt (3 changed lines)

generation_loop.py (181 changed lines)

@@ -0,0 +1,181 @@
import time
import json
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from inference import Inference, torch_reseed


def check_append_file(prompt: str) -> str:
    if prompt.startswith("@"):
        prompt = prompt[1:]  # remove the leading '@'
        filename = prompt.split(" ")[0]
        try:
            with open(filename, "r") as f:
                content = f.read()
                return "'''%s'''\n\n%s" % (content, prompt)
        except FileNotFoundError:
            print(f"File '{filename}' not found.")
    return prompt


def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}
class Terminal:
    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages: list[dict] = [systemmessage]
        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text is not None:
            self.messages.append({"role": role, "content": input_text})
        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]
        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})
        print("")

        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result is not None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")
    def join(self):
        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")
            input_text = check_append_file(input_text)

            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # self.append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.messages = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.messages = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough messages in history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1)  # generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough messages in history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in message_backup:
                    role = m["role"]
                    content = m["content"]
                    # flip user and assistant so the model answers in place of the user
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.messages.append({"role": role, "content": content})
                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None)  # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt>   answer as 'tool' in <tool_response> tags")
                print("/clear      clear chat history")
                print("/undo       undo latest prompt")
                print("/regen      regenerate the last message")
                print("/more       generate more additional information")
                print("/file       read prompt input from file")
                print("/auto       automatically advance conversation")
                print("/summarize  generate a summary of the chat")
                print("/title      generate a title of the chat")
                print("/save       write chat history to file")
                print("/load       load previously saved history")
                print("/help       print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)
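
For reference, a minimal sketch of how the extracted Terminal class can be exercised without loading a real model. StubInference and FakeTokens are hypothetical stand-ins (not part of this commit) that mimic only the pieces of the Inference interface used above, and it assumes parse_and_execute_tool_call returns None for plain text that contains no tool call:

# hypothetical smoke test for Terminal, not part of this commit
from generation_loop import Terminal, msg


class FakeTokens(list):
    # minimal stand-in exposing .shape like a (1, n) token tensor
    @property
    def shape(self):
        return (1, len(self[0]))


class StubInference:
    # mimics only the methods Terminal calls: tokenize() and generate()
    def tokenize(self, messages, tokenize=True, assistant_prefix=None):
        return FakeTokens([[len(m["content"]) for m in messages]])

    def generate(self, inputs):
        outputs = [list(inputs[0]) + [0, 0, 0]]  # pretend three new tokens were generated
        return outputs, "stub reply"


terminal = Terminal(StubInference(), msg("system", "test system prompt"))
terminal.append_generate_chat("hello")
print(terminal.messages[-1])  # expected: {'role': 'assistant', 'content': 'stub reply'}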

llama.py (284 changed lines)

@@ -1,273 +1,43 @@
import time
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from tool_helper import tool_list
from tool_functions import register_dummy
from inference import Inference, torch_reseed
from inference import Inference
import datetime
from modelconfig import Modelconfig
messages = []
inference = None
# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."
# system message for role flip so the model automatically answers for the user
roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
# system messages and user message to bring the model to summarize the entire conversation
summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
import model_selection
from generation_loop import Terminal, msg
# system message to create a conversation title
title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}
append_toolcalls = False
register_dummy()
def initialize_config(inference: Inference) -> Terminal:
def append_generate_chat(input_text: str, role="user"):
    t_start = time.time()

    # generate AI response
    if input_text != None:
        messages.append({"role": role, "content": input_text})
    inputs = inference.tokenize(messages, tokenize=True)
    number_of_input_tokens = inputs.shape[1]
    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
    print("")

    time_taken = time.time() - t_start
    number_of_tokens = len(outputs[0])
    tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
    print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

    # handle tool call and check if a tool call has happened.
    tool_result = parse_and_execute_tool_call(out_text, tool_list)
    if tool_result != None:
        # tool call happened
        tool_result = "<tool_response>%s</tool_response>" % tool_result
        # depending on the chat template the tool response tags must or must not be passed. :(
        append_generate_chat(tool_result, role="tool")
def terminal_generation_loop():
    global messages
    global inference

    while True:
        # print an input prompt to receive text or commands
        input_text = input(">>> ")
        print("")

        if input_text.startswith("!"):
            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
            # append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

        elif input_text.startswith("/clear"):
            print("clearing chat history")
            start_msg = messages[0]
            messages = [start_msg]
            print("")

        elif input_text.startswith("/history"):
            history = inference.tokenize(messages, tokenize=False)
            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            print(history)

        elif input_text.startswith("/undo"):
            if len(messages) > 2:
                print("undo latest prompt")
                messages = messages[:-2]
            else:
                print("cannot undo because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/regen"):
            if len(messages) >= 2:
                print("regenerating message (not working)")
                messages = messages[:-1]
                seed = random.randint(0, 2**32 - 1)  # generate a random seed
                torch_reseed(seed)
                append_generate_chat(None)
            else:
                print("cannot regenerate because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/more"):
            append_generate_chat(None)

        elif input_text.startswith("/file"):
            filename = input_text[len("/file "):]
            print("read '%s' for prompt:" % filename)
            with open(filename, "r") as f:
                content = f.read()
            print(content)
            append_generate_chat(content)

        elif input_text.startswith("/auto"):
            messages_backup = messages
            messages = [roleflip]
            for m in messages_backup:
                role = m["role"]
                content = m["content"]
                if role == "user":
                    role = "assistant"
                elif role == "assistant":
                    role = "user"
                if role != "system":
                    messages.append({"role": role, "content": content})
            append_generate_chat(None)  # will automatically advance the conversation as 'user'
            last_message = messages[-1]
            last_message["role"] = "user"
            messages = messages_backup + [last_message]
            append_generate_chat(None)  # 'regular' chatbot answer

        elif input_text.startswith("/summarize"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
            # messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/title"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
            messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/help"):
            print("!<prompt>   answer as 'tool' in <tool_response> tags")
            print("/clear      clear chat history")
            print("/undo       undo latest prompt")
            print("/regen      regenerate the last message")
            print("/more       generate more additional information")
            print("/file       read prompt input from file")
            print("/auto       automatically advance conversation")
            print("/summarize  generate a summary of the chat")
            print("/title      generate a title of the chat")
            print("/help       print this message")
            print("")

        elif input_text.startswith("/"):
            print("unknown command.")

        else:
            append_generate_chat(input_text)
def main():
    global messages
    global inference

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    inference = Inference(model)

    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    append_toolcalls = False
    if append_toolcalls:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
    else:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

    terminal = Terminal(inference, systemmessage)
    terminal_generation_loop()

    # system message for role flip so the model automatically answers for the user
    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
    # system messages and user message to bring the model to summarize the entire conversation
    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
    # system message to create a conversation title
    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
    return terminal


if __name__ == "__main__":
    main()
    inference = Inference(model_selection.get_model())
    terminal = initialize_config(inference)
    terminal.join()

model_selection.py (95 changed lines)

@@ -0,0 +1,95 @@
from modelconfig import Modelconfig


def get_model() -> Modelconfig:
    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model
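
get_model() is consumed by the new entry point in llama.py; switching models is done by changing which Modelconfig line is uncommented here. Roughly:

from inference import Inference
import model_selection

# build the inference backend from the currently selected model configuration
inference = Inference(model_selection.get_model())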

requirements.txt (3 changed lines)

@@ -1,3 +1,4 @@
transformers
accelerate
bitsandbytes
bitsandbytes
pytest
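
pytest is added as a dependency; as a sketch, the pure helpers in generation_loop.py lend themselves to small unit tests like the following (the file name and test cases are illustrative assumptions, not part of this commit):

# test_generation_loop.py (hypothetical)
from generation_loop import msg, check_append_file


def test_msg_builds_chat_dict():
    assert msg("user", "hi") == {"role": "user", "content": "hi"}


def test_check_append_file_passthrough():
    # prompts that do not start with '@' are returned unchanged
    assert check_append_file("hello world") == "hello world"


def test_check_append_file_inlines_file_content(tmp_path):
    f = tmp_path / "note.txt"
    f.write_text("file body")
    prompt = "@%s summarize this" % f
    # the '@' is stripped and the file content is wrapped in triple quotes
    assert check_append_file(prompt) == "'''file body'''\n\n%s summarize this" % f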