try out some more models

2025-01-13 20:47:48 +01:00
parent 677eb6d0ea
commit 19870cdea8
5 changed files with 288 additions and 47 deletions

llama.py

@@ -4,7 +4,7 @@ from tool_helper import tool_list, parse_and_execute_tool_call
from tool_functions import register_dummy
from inference import Inference, torch_reseed
import datetime
from modelconfig import Modelconfig
messages = []
@@ -12,6 +12,7 @@ inference = None
# systemmessage at the very beginning of the chat. It will be concatenated with the automatic tool usage descriptions
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
systemmessage = "Hold a casual conversation with the user. Answer using markdown to the user."
# system message for role flip so the model automatically answers for the user
roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -23,7 +24,7 @@ summarize_user = {"role": "system", "content": "Can you summarize the conversati
# system message to create a conversation title
title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting; focus on clarity and brevity."}
append_toolcalls = True
append_toolcalls = False
register_dummy()
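append_toolcalls decides whether a tool-usage description is appended to the initial system message; the new main() further down concatenates inference.generate_tool_use_header(tool_list) only when the flag is set, and this commit leaves it at False. The helper itself is not part of this diff, so the sketch below only illustrates one common way such a header can be built from a list of Python callables via their signatures and docstrings; build_tool_header and its output format are assumptions, not the project's implementation.

# Hypothetical stand-in for generate_tool_use_header(); not taken from this repository.
import inspect
import json

def build_tool_header(tool_list):
    """Describe each callable so the model knows which tools exist and how to call them."""
    specs = []
    for tool in tool_list:
        sig = inspect.signature(tool)
        specs.append({
            "name": tool.__name__,
            "parameters": [p.name for p in sig.parameters.values()],
            "description": (tool.__doc__ or "").strip(),
        })
    return ("You can call one of the following tools by answering with a JSON object "
            'of the form {"name": ..., "arguments": {...}}:\n' + json.dumps(specs, indent=2))

With append_toolcalls set to False, as in this commit, the system message stays at the plain systemmessage plus the current date.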
@@ -38,6 +39,7 @@ def append_generate_chat(input_text: str, role="user"):
messages.append({"role": role, "content": input_text})
inputs = inference.tokenize(messages, tokenize=True)
number_of_input_tokens = inputs.shape[1]
outputs, out_text = inference.generate(inputs)
@@ -45,7 +47,10 @@ def append_generate_chat(input_text: str, role="user"):
messages.append({"role": "assistant", "content": out_text})
print("")
print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
time_taken = time.time() - t_start
number_of_tokens = len(outputs[0])
tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))
# check whether the response contains a tool call and execute it if so.
tool_result = parse_and_execute_tool_call(out_text, tool_list)
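The reworked timing block above subtracts the prompt length (number_of_input_tokens, taken from inputs.shape[1]) before computing tokens per second, while the "speed" figures in the benchmark comments further down appear to be the full output length divided by the elapsed time. A small worked example with the Hermes-3 run recorded below (315 tokens in 94.360 s) and an assumed prompt of 60 tokens, a value that is not in the commit:

# Illustration only; the 60-token prompt length is an assumption.
time_taken = 94.360
number_of_tokens = 315          # len(outputs[0]): prompt plus completion
number_of_input_tokens = 60     # inputs.shape[1] (assumed value)

bulk_rate = number_of_tokens / time_taken                                   # ~3.338 t/s, matches the comment blocks
generation_rate = (number_of_tokens - number_of_input_tokens) / time_taken  # ~2.702 t/s, what the new print reports
print("%.3f t/s vs. %.3f t/s" % (bulk_rate, generation_rate))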
@@ -56,20 +61,10 @@ def append_generate_chat(input_text: str, role="user"):
append_generate_chat(tool_result, role="tool")
def main():
def terminal_generation_loop():
global messages
global inference
inference = Inference()
current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and it's %I:%M %p right now.")
if append_toolcalls:
messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
else:
messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
while True:
# print an input prompt to receive text or commands
input_text = input(">>> ")
@@ -173,6 +168,106 @@ def main():
append_generate_chat(input_text)
def main():
global messages
global inference
# model: NousResearch/Hermes-3-Llama-3.2-3B
# tokens: 315 tk
# time: 94.360 s
# speed: 3.338 tk/s
# vram_bulk: 3622 MB
# vram_top: 80 MB
# context: 131072 tk
# model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)
# model: unsloth/Llama-3.2-1B
# tokens: 589 tk
# time: 39.348 s
# speed: 14.969 tk/s
# vram_bulk: 4708 MB
# vram_top: 102 MB
# context: 131072 tk
# model = Modelconfig("unsloth/Llama-3.2-1B") # note, fast, but talks to itself. basically does not work.
# model: unsloth/Llama-3.2-3B-Instruct
# tokens: 285 tk
# time: 75.363 s
# speed: 3.782 tk/s
# vram_bulk: 3512 MB
# vram_top: 48 MB
# context: 131072 tk
# model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
# model: unsloth/llama-3-8b-bnb-4bit
# tokens: 435 tk
# time: 84.314 s
# speed: 5.159 tk/s
# vram_bulk: 5440 MB
# vram_top: 216 MB
# context: 8192 tk
# model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")
# Model size: 3.21B params
# vram used: xxxxx MB
# speed: xxxxx t/s
# working: DOES NOT LOAD
# model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)
# model: unsloth/gemma-2-9b-it-bnb-4bit
# tokens: 154 tk
# time: 32.727 s
# speed: 4.706 tk/s
# vram_bulk: 6156 MB
# vram_top: 232 MB
# context: 8192 tk
# model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")
# model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
# tokens: 120 tk
# time: 12.248 s
# speed: 9.798 tk/s
# vram_bulk: 5382 MB
# vram_top: 170 MB
# context: 32768 tk
model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit") # note, this works really good
# model: unsloth/Qwen2.5-3B-Instruct
# tokens: 112 tk
# time: 12.703 s
# speed: 8.816 tk/s
# vram_bulk: 2108 MB
# vram_top: 98 MB
# context: 32768 tk
# model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
# model: unsloth/Qwen2.5-3B-Instruct
# tokens: 118 tk
# time: 33.748 s
# speed: 3.497 tk/s
# vram_bulk: 3310 MB
# vram_top: 60 MB
# context: 32768 tk
# model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)
# Model size: 3.87B params
# vram used: xxxxx MB
# speed: xxxxx t/s
# error: requires the protobuf library but it was not found in your environment
# model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
inference = Inference(model)
current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and it's %I:%M %p right now.")
if append_toolcalls:
messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
else:
messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time}]
terminal_generation_loop()
if __name__ == "__main__":
main()
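The Modelconfig wrapper and the vram_bulk/vram_top columns in the benchmark comments are defined outside this diff. As a rough sketch of how such figures can be collected with the transformers/bitsandbytes stack: build a BitsAndBytesConfig that mirrors the load_in_8bit/load_in_4bit flags, read torch.cuda.memory_allocated() before and after loading for the resident weights, and use torch.cuda.max_memory_allocated() around a generate() call for the transient peak. The function name benchmark(), the max_new_tokens cap, and the exact meaning of "bulk" versus "top" are assumptions rather than the project's code.

# Sketch under the assumptions stated above; not the repository's Modelconfig or measurement code.
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def benchmark(model_name, prompt, load_in_8bit=False, load_in_4bit=False):
    quant = None
    if load_in_8bit or load_in_4bit:
        quant = BitsAndBytesConfig(load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit)

    baseline = torch.cuda.memory_allocated()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=quant, device_map="cuda")
    vram_bulk = torch.cuda.memory_allocated() - baseline     # weights resident after load
    torch.cuda.reset_peak_memory_stats()

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    t_start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=256)   # arbitrary cap for the measurement
    time_taken = time.time() - t_start

    vram_top = torch.cuda.max_memory_allocated() - baseline - vram_bulk   # extra peak during generation
    tokens = outputs.shape[1]
    print("%s: %d tk, %.3f s, %.3f t/s, vram_bulk %d MB, vram_top %d MB"
          % (model_name, tokens, time_taken, tokens / time_taken,
             vram_bulk // 2**20, vram_top // 2**20))

Called once per candidate, e.g. benchmark("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", "Hello"), this prints one line in roughly the shape of the comment blocks in main(); the *-bnb-4bit checkpoints already ship pre-quantized, so the flags mainly matter for the plain checkpoints.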