From 7f0cb4915671933a7bf3d301aa617d81d9c4d1e0 Mon Sep 17 00:00:00 2001
From: Florin Tobler
Date: Mon, 13 Jan 2025 22:39:32 +0100
Subject: [PATCH] refactoring

---
 generation_loop.py | 181 +++++++++++++++++++++++++++++
 llama.py           | 284 +++++----------------------------------------
 model_selection.py |  95 +++++++++++++++
 requirements.txt   |   3 +-
 4 files changed, 305 insertions(+), 258 deletions(-)
 create mode 100644 generation_loop.py
 create mode 100644 model_selection.py

diff --git a/generation_loop.py b/generation_loop.py
new file mode 100644
index 0000000..fcfc331
--- /dev/null
+++ b/generation_loop.py
@@ -0,0 +1,181 @@
+import time
+import json
+import random
+from tool_helper import tool_list, parse_and_execute_tool_call
+from inference import Inference, torch_reseed
+
+
+def check_append_file(prompt: str) -> str:
+    """If the prompt starts with '@<filename>', inline that file's content before the prompt."""
+    if prompt.startswith("@"):
+        prompt = prompt[1:]  # remove the '@'
+        filename = prompt.split(" ")[0]
+        try:
+            with open(filename, "r") as f:
+                content = f.read()
+            return "'''%s'''\n\n%s" % (content, prompt)
+        except OSError:
+            print(f"File '{filename}' not found.")
+    return prompt
+
+
+def msg(role: str, content: str) -> dict:
+    """Build a chat message dict in the {"role": ..., "content": ...} format."""
+    return {"role": role, "content": content}
+
+
+class Terminal:
+
+    def __init__(self, inference: Inference, systemmessage: dict):
+        self.inference = inference
+        self.messages: list[dict] = [systemmessage]
+
+        # these are meant to be overwritten by better ones
+        self.roleflip = msg("system", "keep going.")
+        self.summarize = msg("system", "summarize conversation")
+        self.summarize_user = msg("system", "please summarize conversation")
+        self.title_prompt = msg("system", "create a title for this conversation")
+
+    def append_generate_chat(self, input_text: str, role="user"):
+        t_start = time.time()
+
+        # generate AI response
+        if input_text is not None:
+            self.messages.append({"role": role, "content": input_text})
+
+        inputs = self.inference.tokenize(self.messages, tokenize=True)
+        number_of_input_tokens = inputs.shape[1]
+
+        outputs, out_text = self.inference.generate(inputs)
+
+        # append result to message history
+        self.messages.append({"role": "assistant", "content": out_text})
+
+        print("")
+        time_taken = time.time() - t_start
+        number_of_tokens = len(outputs[0])
+        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
+        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))
+
+        # handle tool call and check if a tool call has happened
+        tool_result = parse_and_execute_tool_call(out_text, tool_list)
+        if tool_result is not None:
+            # tool call happened
+            tool_result = "%s" % tool_result
+            # depending on the chat template the tool response tags must or must not be passed. :(
+            self.append_generate_chat(tool_result, role="tool")
+
+    def join(self):
+
+        while True:
+            # print an input prompt to receive text or commands
+            input_text = input(">>> ")
+            print("")
+
+            input_text = check_append_file(input_text)
+
+
+            if input_text.startswith("!"):
+                self.append_generate_chat("%s" % input_text[1:], role="tool")
+                # append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(
+
+            elif input_text.startswith("/clear"):
+                print("clearing chat history")
+                start_msg = self.messages[0]
+                self.messages = [start_msg]
+                print("")
+
+            elif input_text.startswith("/history"):
+                history = self.inference.tokenize(self.messages, tokenize=False)
+                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+                print(history)
+
+            elif input_text.startswith("/undo"):
+                if len(self.messages) > 2:
+                    print("undo latest prompt")
+                    self.messages = self.messages[:-2]
+                else:
+                    print("cannot undo because there are not enough messages in history.")
+                print("")
+
+            elif input_text.startswith("/regen"):
+                if len(self.messages) >= 2:
+                    print("regenerating message (not working)")
+                    self.messages = self.messages[:-1]
+                    seed = random.randint(0, 2**32 - 1)  # generate a random seed
+                    torch_reseed(seed)
+                    self.append_generate_chat(None)
+                else:
+                    print("cannot regenerate because there are not enough messages in history.")
+                print("")
+
+            elif input_text.startswith("/more"):
+                self.append_generate_chat(None)
+
+            elif input_text.startswith("/file"):
+                filename = input_text[len("/file "):]
+                print("read '%s' for prompt:" % filename)
+                with open(filename, "r") as f:
+                    content = f.read()
+                print(content)
+                self.append_generate_chat(content)
+
+            elif input_text.startswith("/auto"):
+                messages_backup = self.messages
+                self.messages = [self.roleflip]
+                for m in messages_backup:
+                    role = m["role"]
+                    content = m["content"]
+                    if role == "user":
+                        role = "assistant"
+                    elif role == "assistant":
+                        role = "user"
+                    if role != "system":
+                        self.messages.append({"role": role, "content": content})
+                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
+                last_message = self.messages[-1]
+                last_message["role"] = "user"
+                self.messages = messages_backup + [last_message]
+                self.append_generate_chat(None)  # 'regular' chatbot answer
+
+            elif input_text.startswith("/summarize"):
+                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
+                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
+                # messages_temp[-1]["role"] = "user"
+                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
+                generated_tokens, full_output = self.inference.generate(input_ids)
+
+            elif input_text.startswith("/title"):
+                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
+                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
+                messages_temp[-1]["role"] = "user"
+                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
+                generated_tokens, full_output = self.inference.generate(input_ids)
+
+            elif input_text.startswith("/save"):
+                with open("messages.json", "w") as f:
+                    json.dump(self.messages, f, indent=4)
+
+            elif input_text.startswith("/load"):
+                with open("messages.json", "r") as f:
+                    new_messages = json.load(f)
+                self.messages = [self.messages[0]] + new_messages[1:]
+
+            elif input_text.startswith("/help"):
+                print("!             answer as 'tool' in tags")
+                print("/clear        clear chat history")
+                print("/undo         undo latest prompt")
+                print("/regen        regenerate the last message")
+                print("/more         generate more additional information")
+                print("/file         read prompt input from file")
+                print("/auto         automatically advance conversation")
+                print("/summarize    generate a summary of the chat")
+                print("/title        generate a title of the chat")
+                print("/save         write chat history to file")
+                print("/load         load previously saved history")
+                print("/help         print this message")
+                print("")
+
+            elif input_text.startswith("/"):
+                print("unknown command.")
+
+            else:
+                self.append_generate_chat(input_text)
\ No newline at end of file
diff --git a/llama.py b/llama.py
index d3da9ad..551d318 100644
--- a/llama.py
+++ b/llama.py
@@ -1,273 +1,43 @@
-import time
-import random
-from tool_helper import tool_list, parse_and_execute_tool_call
+
+from tool_helper import tool_list
 from tool_functions import register_dummy
-from inference import Inference, torch_reseed
+from inference import Inference
 import datetime
-from modelconfig import Modelconfig
-
-
-messages = []
-inference = None
-
-# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
-systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
-systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."
-
-# system message for role flip so the model automatically answers for the user
-roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
-
-# system messages and user message to bring the model to summarize the entire conversation
-summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
-summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
+import model_selection
+from generation_loop import Terminal, msg

-# system message to create a conversation title
-title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}
-
-append_toolcalls = False

 register_dummy()


+def initialize_config(inference: Inference) -> Terminal:

-
-def append_generate_chat(input_text: str, role="user"):
-    t_start = time.time()
-
-    # generate AI response
-    if input_text != None:
-        messages.append({"role": role, "content": input_text})
-
-    inputs = inference.tokenize(messages, tokenize=True)
-    number_of_input_tokens = inputs.shape[1]
-
-    outputs, out_text = inference.generate(inputs)
-
-    # append result to message history
-    messages.append({"role": "assistant", "content": out_text})
-
-    print("")
-    time_taken = time.time() - t_start
-    number_of_tokens = len(outputs[0])
-    tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
-    print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))
-
-    # handle tool call and check if a tool call has happened.
-    tool_result = parse_and_execute_tool_call(out_text, tool_list)
-    if tool_result != None:
-        # tool call happened
-        tool_result = "%s" % tool_result
-        # depending on the chat template the tool response tags must or must not be passed. :(
-        append_generate_chat(tool_result, role="tool")
-
-
-def terminal_generation_loop():
-    global messages
-    global inference
-
-    while True:
-        # print an input prompt to receive text or commands
-        input_text = input(">>> ")
-        print("")
-
-
-        if input_text.startswith("!"):
-            append_generate_chat("%s" % input_text[1:], role="tool")
-            # append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(
-
-        elif input_text.startswith("/clear"):
-            print("clearing chat history")
-            start_msg = messages[0]
-            messages = [start_msg]
-            print("")
-
-        elif input_text.startswith("/history"):
-            history = inference.tokenize(messages, tokenize=False)
-            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
-            print(history)
-
-        elif input_text.startswith("/undo"):
-            if len(messages) > 2:
-                print("undo latest prompt")
-                messages = messages[:-2]
-            else:
-                print("cannot undo because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/regen"):
-            if len(messages) >= 2:
-                print("regenerating message (not working)")
-                messages = messages[:-1]
-                seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-                torch_reseed(seed)
-                append_generate_chat(None)
-            else:
-                print("cannot regenerate because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/more"):
-            append_generate_chat(None)
-
-        elif input_text.startswith("/file"):
-            filename = input_text[len("/file "):]
-            print("read '%s' for prompt:" % filename)
-            with open(filename, "r") as f:
-                content = f.read()
-            print(content)
-            append_generate_chat(content)
-
-        elif input_text.startswith("/auto"):
-            messages_backup = messages
-            messages = [roleflip]
-            for m in messages_backup:
-                role = m["role"]
-                content = m["content"]
-                if role == "user":
-                    role = "assistant"
-                elif role == "assistant":
-                    role = "user"
-                if role != "system":
-                    messages.append({"role": role, "content": content})
-            append_generate_chat(None)  # will automatically advance the conversation as 'user'
-            last_message = messages[-1]
-            last_message["role"] = "user"
-            messages = messages_backup + [last_message]
-            append_generate_chat(None)  # 'regular' chatbot answer
-
-        elif input_text.startswith("/summarize"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
-            # messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/title"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [title_prompt] + messages_temp  #+ [dict(title)]  # copy dict in last instance
-            messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/help"):
-            print("!             answer as 'tool' in tags")
-            print("/clear        clear chat history")
-            print("/undo         undo latest prompt")
-            print("/regen        regenerate the last message")
-            print("/more         generate more additional information")
-            print("/file         read prompt input from file")
-            print("/auto         automatically advance conversation")
-            print("/summarize    generate a summary of the chat")
-            print("/title        generate a title of the chat")
-            print("/help         print this message")
-            print("")
-
-        elif input_text.startswith("/"):
-            print("unknown command.")
-
-        else:
-            append_generate_chat(input_text)
-
-
-def main():
-    global messages
-    global inference
-
-    # model: NousResearch/Hermes-3-Llama-3.2-3B
-    # tokens: 315 tk
-    # time: 94.360 s
-    # speed: 3.338 tk/s
-    # vram_bulk: 3622 MB
-    # vram_top: 80 MB
-    # context: 131072 tk
-    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)
-
-    # model: unsloth/Llama-3.2-1B
-    # tokens: 589 tk
-    # time: 39.348 s
-    # speed: 14.969 tk/s
-    # vram_bulk: 4708 MB
-    # vram_top: 102 MB
-    # context: 131072 tk
-    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.
-
-    # model: unsloth/Llama-3.2-3B-Instruct
-    # tokens: 285 tk
-    # time: 75.363 s
-    # speed: 3.782 tk/s
-    # vram_bulk: 3512 MB
-    # vram_top: 48 MB
-    # context: 131072 tk
-    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
-
-    # model: unsloth/llama-3-8b-bnb-4bit
-    # tokens: 435 tk
-    # time: 84.314 s
-    # speed: 5.159 tk/s
-    # vram_bulk: 5440 MB
-    # vram_top: 216 MB
-    # context: 8192 tk
-    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")
-
-    # Model size: 3.21B params
-    # vram used: xxxxx MB
-    # speed: xxxxx t/s
-    # working: DOES NOT LOAD
-    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)
-
-    # model: unsloth/gemma-2-9b-it-bnb-4bit
-    # tokens: 154 tk
-    # time: 32.727 s
-    # speed: 4.706 tk/s
-    # vram_bulk: 6156 MB
-    # vram_top: 232 MB
-    # context: 8192 tk
-    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")
-
-    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
-    # tokens: 120 tk
-    # time: 12.248 s
-    # speed: 9.798 tk/s
-    # vram_bulk: 5382 MB
-    # vram_top: 170 MB
-    # context: 32768 tk
-    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good
-
-    # model: unsloth/Qwen2.5-3B-Instruct
-    # tokens: 112 tk
-    # time: 12.703 s
-    # speed: 8.816 tk/s
-    # vram_bulk: 2108 MB
-    # vram_top: 98 MB
-    # context: 32768 tk
-    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
-
-    # model: unsloth/Qwen2.5-3B-Instruct
-    # tokens: 118 tk
-    # time: 33.748 s
-    # speed: 3.497 tk/s
-    # vram_bulk: 3310 MB
-    # vram_top: 60 MB
-    # context: 32768 tk
-    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)
-
-    # Model size: 3.87B params
-    # vram used: xxxxx MB
-    # speed: xxxxx t/s
-    # error: requires the protobuf library but it was not found in your environment
-    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
-
-
-    inference = Inference(model)
-
+    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
+    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
     current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
+    append_toolcalls = False
     if append_toolcalls:
-        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
     else:
-        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)
+
+    terminal = Terminal(inference, systemmessage)

-    terminal_generation_loop()
+    # system message for role flip so the model automatically answers for the user
+    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
+
+    # system messages and user message to bring the model to summarize the entire conversation
+    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
+    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
+
+    # system message to create a conversation title
+    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
+    return terminal


 if __name__ == "__main__":
-    main()
\ No newline at end of file
+
+    inference = Inference(model_selection.get_model())
+    terminal = initialize_config(inference)
+    terminal.join()
\ No newline at end of file
diff --git a/model_selection.py b/model_selection.py
new file mode 100644
index 0000000..fa6e9d3
--- /dev/null
+++ b/model_selection.py
@@ -0,0 +1,95 @@
+
+from modelconfig import Modelconfig
+
+
+
+def get_model() -> Modelconfig:
+    """Return the Modelconfig to load; measured stats for the alternatives are kept as comments."""
+
+    # model: NousResearch/Hermes-3-Llama-3.2-3B
+    # tokens: 315 tk
+    # time: 94.360 s
+    # speed: 3.338 tk/s
+    # vram_bulk: 3622 MB
+    # vram_top: 80 MB
+    # context: 131072 tk
+    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)
+
+    # model: unsloth/Llama-3.2-1B
+    # tokens: 589 tk
+    # time: 39.348 s
+    # speed: 14.969 tk/s
+    # vram_bulk: 4708 MB
+    # vram_top: 102 MB
+    # context: 131072 tk
+    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.
+
+    # model: unsloth/Llama-3.2-3B-Instruct
+    # tokens: 285 tk
+    # time: 75.363 s
+    # speed: 3.782 tk/s
+    # vram_bulk: 3512 MB
+    # vram_top: 48 MB
+    # context: 131072 tk
+    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
+
+    # model: unsloth/llama-3-8b-bnb-4bit
+    # tokens: 435 tk
+    # time: 84.314 s
+    # speed: 5.159 tk/s
+    # vram_bulk: 5440 MB
+    # vram_top: 216 MB
+    # context: 8192 tk
+    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")
+
+    # Model size: 3.21B params
+    # vram used: xxxxx MB
+    # speed: xxxxx t/s
+    # working: DOES NOT LOAD
+    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)
+
+    # model: unsloth/gemma-2-9b-it-bnb-4bit
+    # tokens: 154 tk
+    # time: 32.727 s
+    # speed: 4.706 tk/s
+    # vram_bulk: 6156 MB
+    # vram_top: 232 MB
+    # context: 8192 tk
+    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")
+
+    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
+    # tokens: 120 tk
+    # time: 12.248 s
+    # speed: 9.798 tk/s
+    # vram_bulk: 5382 MB
+    # vram_top: 170 MB
+    # context: 32768 tk
+    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this one works really well
+
+    # model: unsloth/Qwen2.5-3B-Instruct
+    # tokens: 112 tk
+    # time: 12.703 s
+    # speed: 8.816 tk/s
+    # vram_bulk: 2108 MB
+    # vram_top: 98 MB
+    # context: 32768 tk
+    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
+
+    # model: unsloth/Qwen2.5-3B-Instruct
+    # tokens: 118 tk
+    # time: 33.748 s
+    # speed: 3.497 tk/s
+    # vram_bulk: 3310 MB
+    # vram_top: 60 MB
+    # context: 32768 tk
+    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)
+
+    # Model size: 3.87B params
+    # vram used: xxxxx MB
+    # speed: xxxxx t/s
+    # error: requires the protobuf library but it was not found in your environment
+    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
+
+    return model
+
+
diff --git a/requirements.txt b/requirements.txt
index d301274..9621384 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 transformers
 accelerate
-bitsandbytes
\ No newline at end of file
+bitsandbytes
+pytest
\ No newline at end of file
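
Note on the new pytest dependency: requirements.txt now pulls in pytest, but this patch does not add a test file yet. Below is a minimal sketch of what a first test for the new helpers in generation_loop.py could look like. It is not part of the change: the file name test_generation_loop.py and the use of pytest's tmp_path fixture are assumptions, and importing generation_loop will also import the project's tool_helper and inference modules.

# test_generation_loop.py (suggested sketch, not included in this patch)
from generation_loop import msg, check_append_file


def test_msg_builds_chat_message_dict():
    assert msg("user", "hello") == {"role": "user", "content": "hello"}


def test_check_append_file_without_marker_returns_prompt_unchanged():
    assert check_append_file("plain prompt") == "plain prompt"


def test_check_append_file_inlines_referenced_file(tmp_path):
    # '@<filename> ...' should inline the file content before the remaining prompt text
    f = tmp_path / "notes.txt"
    f.write_text("file body")
    result = check_append_file("@%s summarize this" % f)
    assert "file body" in result
    assert str(f) in result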