
refactoring

master
Florin Tobler 5 months ago
parent commit 7f0cb49156
  1. generation_loop.py (181 changed lines)
  2. llama.py (284 changed lines)
  3. model_selection.py (95 changed lines)
  4. requirements.txt (3 changed lines)

generation_loop.py (181 changed lines)

@@ -0,0 +1,181 @@
import time
import json
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from inference import Inference, torch_reseed


def check_append_file(prompt: str) -> str:
    if prompt.startswith("@"):
        prompt = prompt[1:]  # remove the leading '@'
        filename = prompt.split(" ")[0]
        try:
            with open(filename, "r") as f:
                content = f.read()
                return "'''%s'''\n\n%s" % (content, prompt)
        except FileNotFoundError:
            print(f"File '{filename}' not found.")
    return prompt


def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}
class Terminal:
    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages: list[dict] = [systemmessage]
        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text is not None:
            self.messages.append({"role": role, "content": input_text})
        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]
        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})
        print("")

        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result is not None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")
    def join(self):
        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")
            input_text = check_append_file(input_text)

            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # self.append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.messages = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.messages = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough messages in history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1)  # generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough messages in history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in message_backup:
                    role = m["role"]
                    content = m["content"]
                    # flip user and assistant so the model answers in place of the user
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.messages.append({"role": role, "content": content})
                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None)  # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt>   answer as 'tool' in <tool_response> tags")
                print("/clear      clear chat history")
                print("/undo       undo latest prompt")
                print("/regen      regenerate the last message")
                print("/more       generate more additional information")
                print("/file       read prompt input from file")
                print("/auto       automatically advance conversation")
                print("/summarize  generate a summary of the chat")
                print("/title      generate a title of the chat")
                print("/save       write chat history to file")
                print("/load       load previously saved history")
                print("/help       print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)
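
For reference, a minimal sketch of how the extracted Terminal class can be exercised without loading a real model. StubInference and FakeTokens are hypothetical stand-ins (not part of this commit) that mimic only the pieces of the Inference interface used above, and it assumes parse_and_execute_tool_call returns None for plain text that contains no tool call:

# hypothetical smoke test for Terminal, not part of this commit
from generation_loop import Terminal, msg


class FakeTokens(list):
    # minimal stand-in exposing .shape like a (1, n) token tensor
    @property
    def shape(self):
        return (1, len(self[0]))


class StubInference:
    # mimics only the methods Terminal calls: tokenize() and generate()
    def tokenize(self, messages, tokenize=True, assistant_prefix=None):
        return FakeTokens([[len(m["content"]) for m in messages]])

    def generate(self, inputs):
        outputs = [list(inputs[0]) + [0, 0, 0]]  # pretend three new tokens were generated
        return outputs, "stub reply"


terminal = Terminal(StubInference(), msg("system", "test system prompt"))
terminal.append_generate_chat("hello")
print(terminal.messages[-1])  # expected: {'role': 'assistant', 'content': 'stub reply'}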

llama.py (284 changed lines)

@@ -1,273 +1,43 @@
import time
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from tool_helper import tool_list
from tool_functions import register_dummy
from inference import Inference, torch_reseed
from inference import Inference
import datetime
from modelconfig import Modelconfig
messages = []
inference = None
# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."
# system message for role flip so the model automatically answers for the user
roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
# system messages and user message to bring the model to summarize the entire conversation
summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
import model_selection
from generation_loop import Terminal, msg
# system message to create a conversation title
title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}
append_toolcalls = False
register_dummy()
def initialize_config(inference: Inference) -> Terminal:
def append_generate_chat(input_text: str, role="user"):
    t_start = time.time()

    # generate AI response
    if input_text != None:
        messages.append({"role": role, "content": input_text})
    inputs = inference.tokenize(messages, tokenize=True)
    number_of_input_tokens = inputs.shape[1]
    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
    print("")

    time_taken = time.time() - t_start
    number_of_tokens = len(outputs[0])
    tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
    print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

    # handle tool call and check if a tool call has happened.
    tool_result = parse_and_execute_tool_call(out_text, tool_list)
    if tool_result != None:
        # tool call happened
        tool_result = "<tool_response>%s</tool_response>" % tool_result
        # depending on the chat template the tool response tags must or must not be passed. :(
        append_generate_chat(tool_result, role="tool")
def terminal_generation_loop():
    global messages
    global inference

    while True:
        # print an input prompt to receive text or commands
        input_text = input(">>> ")
        print("")

        if input_text.startswith("!"):
            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
            # append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

        elif input_text.startswith("/clear"):
            print("clearing chat history")
            start_msg = messages[0]
            messages = [start_msg]
            print("")

        elif input_text.startswith("/history"):
            history = inference.tokenize(messages, tokenize=False)
            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            print(history)

        elif input_text.startswith("/undo"):
            if len(messages) > 2:
                print("undo latest prompt")
                messages = messages[:-2]
            else:
                print("cannot undo because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/regen"):
            if len(messages) >= 2:
                print("regenerating message (not working)")
                messages = messages[:-1]
                seed = random.randint(0, 2**32 - 1)  # generate a random seed
                torch_reseed(seed)
                append_generate_chat(None)
            else:
                print("cannot regenerate because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/more"):
            append_generate_chat(None)

        elif input_text.startswith("/file"):
            filename = input_text[len("/file "):]
            print("read '%s' for prompt:" % filename)
            with open(filename, "r") as f:
                content = f.read()
            print(content)
            append_generate_chat(content)

        elif input_text.startswith("/auto"):
            messages_backup = messages
            messages = [roleflip]
            for m in messages_backup:
                role = m["role"]
                content = m["content"]
                if role == "user":
                    role = "assistant"
                elif role == "assistant":
                    role = "user"
                if role != "system":
                    messages.append({"role": role, "content": content})
            append_generate_chat(None)  # will automatically advance the conversation as 'user'
            last_message = messages[-1]
            last_message["role"] = "user"
            messages = messages_backup + [last_message]
            append_generate_chat(None)  # 'regular' chatbot answer

        elif input_text.startswith("/summarize"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
            # messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/title"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
            messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/help"):
            print("!<prompt>   answer as 'tool' in <tool_response> tags")
            print("/clear      clear chat history")
            print("/undo       undo latest prompt")
            print("/regen      regenerate the last message")
            print("/more       generate more additional information")
            print("/file       read prompt input from file")
            print("/auto       automatically advance conversation")
            print("/summarize  generate a summary of the chat")
            print("/title      generate a title of the chat")
            print("/help       print this message")
            print("")

        elif input_text.startswith("/"):
            print("unknown command.")

        else:
            append_generate_chat(input_text)
def main():
    global messages
    global inference

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    inference = Inference(model)

    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    append_toolcalls = False
    if append_toolcalls:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
    else:
        messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

    terminal = Terminal(inference, systemmessage)
    terminal_generation_loop()

    # system message for role flip so the model automatically answers for the user
    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
    # system messages and user message to bring the model to summarize the entire conversation
    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
    # system message to create a conversation title
    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
    return terminal


if __name__ == "__main__":
    main()
    inference = Inference(model_selection.get_model())
    terminal = initialize_config(inference)
    terminal.join()

model_selection.py (95 changed lines)

@@ -0,0 +1,95 @@
from modelconfig import Modelconfig


def get_model() -> Modelconfig:
    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model
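
get_model() is consumed by the new entry point in llama.py; switching models is done by changing which Modelconfig line is uncommented here. Roughly:

from inference import Inference
import model_selection

# build the inference backend from the currently selected model configuration
inference = Inference(model_selection.get_model())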

requirements.txt (3 changed lines)

@@ -1,3 +1,4 @@
transformers
accelerate
bitsandbytes
bitsandbytes
pytest
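
pytest is added as a dependency; as a sketch, the pure helpers in generation_loop.py lend themselves to small unit tests like the following (the file name and test cases are illustrative assumptions, not part of this commit):

# test_generation_loop.py (hypothetical)
from generation_loop import msg, check_append_file


def test_msg_builds_chat_dict():
    assert msg("user", "hi") == {"role": "user", "content": "hi"}


def test_check_append_file_passthrough():
    # prompts that do not start with '@' are returned unchanged
    assert check_append_file("hello world") == "hello world"


def test_check_append_file_inlines_file_content(tmp_path):
    f = tmp_path / "note.txt"
    f.write_text("file body")
    prompt = "@%s summarize this" % f
    # the '@' is stripped and the file content is wrapped in triple quotes
    assert check_append_file(prompt) == "'''file body'''\n\n%s summarize this" % f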