Compare commits
3 Commits
adcb172da4 ... 7f0cb49156

Author | SHA1 | Date
---|---|---
 | 7f0cb49156 | 5 months ago
 | 19870cdea8 | 5 months ago
 | 677eb6d0ea | 5 months ago

8 changed files with 485 additions and 194 deletions
@@ -0,0 +1,37 @@
from inference import Inference
from modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()
@@ -0,0 +1,181 @@
import time
import json
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from inference import Inference, torch_reseed


def check_append_file(prompt: str) -> str:
    if prompt.startswith("@"):
        prompt = prompt[1:]  # remove the '@'
        filename = prompt.split(" ")[0]
        try:
            with open(filename, "r") as f:
                content = f.read()
                return "'''%s'''\n\n%s" % (content, prompt)
        except OSError:
            print(f"File '{filename}' not found.")
    return prompt


def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}


class Terminal:

    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages: list[dict] = [systemmessage]

        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text is not None:
            self.messages.append({"role": role, "content": input_text})

        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]

        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})

        print("")
        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result is not None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")

    def join(self):
        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")

            input_text = check_append_file(input_text)

            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # self.append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.messages = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.messages = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1)  # generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in message_backup:
                    role = m["role"]
                    content = m["content"]
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.messages.append({"role": role, "content": content})
                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None)  # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt>   answer as 'tool' in <tool_response> tags")
                print("/clear      clear chat history")
                print("/undo       undo latest prompt")
                print("/regen      regenerate the last message")
                print("/more       generate more additional information")
                print("/file       read prompt input from file")
                print("/auto       automatically advance conversation")
                print("/summarize  generate a summary of the chat")
                print("/title      generate a title of the chat")
                print("/save       write chat history to file")
                print("/load       load previously saved history")
                print("/help       print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)
@@ -0,0 +1,76 @@
from inference import Inference
from modelconfig import Modelconfig
import time
import nvidia_smi
import torch
import gc


def empty_cuda():
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()
    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)

    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]
    print("model:     %s" % model_conf.model_name)
    print("tokens:    %d tk" % tokens)
    print("time:      %.3f s" % took)
    print("speed:     %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top:  %d MB" % vram_top)
    print("context:   %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))


if __name__ == "__main__":
    main()
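The profiler above depends on a project-local `nvidia_smi` module whose `get_gpu_stats()` is expected to return a dict with `memory_used` in MB; that module is not part of this diff. A minimal stand-in sketch, assuming the `pynvml` package and a single GPU at index 0 (both assumptions, not taken from the repository):

```python
# Hypothetical sketch of the nvidia_smi helper assumed by the profiler above;
# not part of this diff. Requires the pynvml package and at least one NVIDIA GPU.
import pynvml


def get_gpu_stats() -> dict:
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0 assumed
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # convert bytes to MB to match the "vram: %d MB" prints above
        return {"memory_used": mem.used // (1024 * 1024)}
    finally:
        pynvml.nvmlShutdown()
```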
@@ -1,175 +1,43 @@
-import time
-import random
-from tool_helper import tool_list, parse_and_execute_tool_call
+from tool_helper import tool_list
 from tool_functions import register_dummy
-from inference import Inference, torch_reseed
+from inference import Inference
 import datetime
+import model_selection
+from generation_loop import Terminal, msg

-messages = []
-inference = None
-
-# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
-systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
-
-# system message for role flip so the model automatically answers for the user
-roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
-
-# system messages and user message to bring the model to summarize the entire conversation
-summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
-summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
-
-# system message to create a conversation title
-title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}

 register_dummy()

-
-def append_generate_chat(input_text: str, role="user"):
-    t_start = time.time()
-
-    # generate AI response
-    if input_text != None:
-        messages.append({"role": role, "content": input_text})
-
-    inputs = inference.tokenize(messages, tokenize=True)
-
-    outputs, out_text = inference.generate(inputs)
-
-    # append result to message history
-    messages.append({"role": "assistant", "content": out_text})
-
-    print("")
-    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
-
-    # handle tool call and check if a tool call has happened.
-    tool_result = parse_and_execute_tool_call(out_text, tool_list)
-    if tool_result != None:
-        # tool call happened
-        tool_result = "<tool_response>%s</tool_response>" % tool_result
-        # depending on the chat template the tool response tags must or must not be passed. :(
-        append_generate_chat(tool_result, role="tool")
-
-
-def main():
-    global messages
-    global inference
-
-    inference = Inference()
-
+def initialize_config(inference: Inference) -> Terminal:
+    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
+    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
+
     current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
-    messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
+    append_toolcalls = False
+    if append_toolcalls:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
+    else:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

-    while True:
-        # print an input prompt to receive text or commands
-        input_text = input(">>> ")
-        print("")
-
-        if input_text.startswith("!"):
-            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
-            # append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(
-
-        elif input_text.startswith("/clear"):
-            print("clearing chat history")
-            start_msg = messages[0]
-            messages = [start_msg]
-            print("")
-
-        elif input_text.startswith("/history"):
-            history = inference.tokenize(messages, tokenize=False)
-            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
-            print(history)
-
-        elif input_text.startswith("/undo"):
-            if len(messages) > 2:
-                print("undo latest prompt")
-                messages = messages[:-2]
-            else:
-                print("cannot undo because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/regen"):
-            if len(messages) >= 2:
-                print("regenerating message (not working)")
-                messages = messages[:-1]
-                seed = random.randint(0, 2**32 - 1)  # generate a random seed
-                torch_reseed(seed)
-                append_generate_chat(None)
-            else:
-                print("cannot regenerate because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/more"):
-            append_generate_chat(None)
-
-        elif input_text.startswith("/file"):
-            filename = input_text[len("/file "):]
-            print("read '%s' for prompt:" % filename)
-            with open(filename, "r") as f:
-                content = f.read()
-            print(content)
-            append_generate_chat(content)
-
-        elif input_text.startswith("/auto"):
-            messages_backup = messages
-            messages = [roleflip]
-            for m in messages_backup:
-                role = m["role"]
-                content = m["content"]
-                if role == "user":
-                    role = "assistant"
-                elif role == "assistant":
-                    role = "user"
-                if role != "system":
-                    messages.append({"role": role, "content": content})
-            append_generate_chat(None)  # will automatically advance the conversation as 'user'
-            last_message = messages[-1]
-            last_message["role"] = "user"
-            messages = messages_backup + [last_message]
-            append_generate_chat(None)  # 'regular' chatbot answer
-
-        elif input_text.startswith("/summarize"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
-            # messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/title"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
-            messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/help"):
-            print("!<prompt>   answer as 'tool' in <tool_response> tags")
-            print("/clear      clear chat history")
-            print("/undo       undo latest prompt")
-            print("/regen      regenerate the last message")
-            print("/more       generate more additional information")
-            print("/file       read prompt input from file")
-            print("/auto       automatically advance conversation")
-            print("/summarize  generate a summary of the chat")
-            print("/title      generate a title of the chat")
-            print("/help       print this message")
-            print("")
-
-        elif input_text.startswith("/"):
-            print("unknown command.")
-
-        else:
-            append_generate_chat(input_text)
+    terminal = Terminal(inference, systemmessage)
+
+    # system message for role flip so the model automatically answers for the user
+    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
+
+    # system messages and user message to bring the model to summarize the entire conversation
+    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
+    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
+
+    # system message to create a conversation title
+    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
+    return terminal


 if __name__ == "__main__":
-    main()
+    inference = Inference(model_selection.get_model())
+    terminal = initialize_config(inference)
+    terminal.join()
@@ -0,0 +1,95 @@
from modelconfig import Modelconfig


def get_model() -> Modelconfig:

    # model:     NousResearch/Hermes-3-Llama-3.2-3B
    # tokens:    315 tk
    # time:      94.360 s
    # speed:     3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top:  80 MB
    # context:   131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model:     unsloth/Llama-3.2-1B
    # tokens:    589 tk
    # time:      39.348 s
    # speed:     14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top:  102 MB
    # context:   131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note: fast, but talks to itself; basically does not work

    # model:     unsloth/Llama-3.2-3B-Instruct
    # tokens:    285 tk
    # time:      75.363 s
    # speed:     3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top:  48 MB
    # context:   131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model:     unsloth/llama-3-8b-bnb-4bit
    # tokens:    435 tk
    # time:      84.314 s
    # speed:     5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top:  216 MB
    # context:   8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used:  xxxxx MB
    # speed:      xxxxx t/s
    # working:    DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model:     unsloth/gemma-2-9b-it-bnb-4bit
    # tokens:    154 tk
    # time:      32.727 s
    # speed:     4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top:  232 MB
    # context:   8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model:     unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens:    120 tk
    # time:      12.248 s
    # speed:     9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top:  170 MB
    # context:   32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note: this works really well

    # model:     unsloth/Qwen2.5-3B-Instruct
    # tokens:    112 tk
    # time:      12.703 s
    # speed:     8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top:  98 MB
    # context:   32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model:     unsloth/Qwen2.5-3B-Instruct
    # tokens:    118 tk
    # time:      33.748 s
    # speed:     3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top:  60 MB
    # context:   32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used:  xxxxx MB
    # speed:      xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model
@@ -0,0 +1,20 @@
from transformers import BitsAndBytesConfig
import torch


class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(  # tool calls don't really work in 4-bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",              # recommended for better performance
                bnb_4bit_use_double_quant=True,         # optional: further quantization for more memory saving
                bnb_4bit_compute_dtype=torch.bfloat16,  # use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config
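The `Inference` class that consumes these configs is not shown in this diff, but a typical way to use `Modelconfig` with Hugging Face `transformers` is to pass `bits_and_bytes_config` as the `quantization_config` argument when loading the model. A sketch under that assumption (the loader below is illustrative, not the repository's actual `Inference` implementation):

```python
# Hypothetical loader showing how Modelconfig's bits_and_bytes_config could be consumed;
# the repository's real Inference class may differ.
from transformers import AutoModelForCausalLM, AutoTokenizer
from modelconfig import Modelconfig

config = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=config.bits_and_bytes_config,  # None falls back to full precision
    device_map="auto",
)
```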
@@ -1,3 +1,4 @@
 transformers
 accelerate
 bitsandbytes
+pytest