Compare commits: adcb172da4 ... 7f0cb49156
3 Commits (SHA1): 7f0cb49156, 19870cdea8, 677eb6d0ea
download_model.py (new file, 37)
@@ -0,0 +1,37 @@
from inference import Inference
from modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()
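download_model.py pre-caches weights by instantiating Inference for every Modelconfig, which also loads each model onto the GPU. If only the Hugging Face cache needs to be populated, a lighter alternative is to fetch the files without loading them; a sketch using huggingface_hub (model list taken from the file above, helper not part of this diff):

# Sketch: populate the local Hugging Face cache without loading any model onto the GPU.
from huggingface_hub import snapshot_download

MODELS = [
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "unsloth/Llama-3.2-1B",
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
]

for repo_id in MODELS:
    path = snapshot_download(repo_id)  # downloads (or reuses) the cached snapshot
    print("cached %s at %s" % (repo_id, path))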
generation_loop.py (new file, 181)
@@ -0,0 +1,181 @@
import time
import json
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from inference import Inference, torch_reseed


def check_append_file(prompt: str) -> str:
    if prompt.startswith("@"):
        prompt = prompt[1:]  # Remove the '@'
        filename = prompt.split(" ")[0]
        try:
            with open(filename, "r") as f:
                content = f.read()
                return "'''%s'''\n\n%s" % (content, prompt)
        except:
            print(f"File '{filename}' not found.")
    return prompt


def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}


class Terminal:

    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages: list[dict] = [systemmessage]

        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text != None:
            self.messages.append({"role": role, "content": input_text})

        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]

        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})

        print("")
        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened.
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result != None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")

    def join(self):

        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")

            input_text = check_append_file(input_text)

            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # self.append_generate_chat("%s" % input_text[1:], role="tool")  # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.messages = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.messages = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1)  # Generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough messages in the history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in message_backup:
                    role = m["role"]
                    content = m["content"]
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.messages.append({"role": role, "content": content})
                self.append_generate_chat(None)  # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None)  # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user]  # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp  # + [dict(title)]  # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt> answer as 'tool' in <tool_response> tags")
                print("/clear clear chat history")
                print("/undo undo latest prompt")
                print("/regen regenerate the last message")
                print("/more generate more additional information")
                print("/file read prompt input from file")
                print("/auto automatically advance conversation")
                print("/summarize generate a summary of the chat")
                print("/title generate a title of the chat")
                print("/save write chat history to file")
                print("/load load previously saved history")
                print("/help print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)
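generation_loop.py factors the interactive REPL out of llama.py into a reusable Terminal class. A minimal wiring sketch (same pattern the new llama.py uses further below; the model name and system prompt here are placeholders):

# Sketch: minimal use of generation_loop.Terminal; prompt text and model are illustrative only.
from inference import Inference
from modelconfig import Modelconfig
from generation_loop import Terminal, msg

inference = Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
terminal = Terminal(inference, msg("system", "Hold a casual conversation with the user."))
terminal.join()  # blocking REPL: plain prompts, '!', '@file' prefixes and '/' commands are handled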
inference.py (79)
@@ -17,41 +17,49 @@ import time
 import utils
 import re
 import os
+from modelconfig import Modelconfig

 torch.set_num_threads(os.cpu_count())  # Adjust this to the number of threads/cores you have


 class Inference:
-    def __init__(self):
-        print("loading LLM...")
+    def __init__(self, modelconfig: Modelconfig):
+        print("loading LLM '%s'..." % modelconfig.model_name)
         t_start = time.time()

         # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "unsloth/phi-4-unsloth-bnb-4bit"  # too big
         # model_name = "gpt2"
         # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
         # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
         # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model

-        quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",  # Recommended for better performance
-            bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
-            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
-        )
-
-        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+        # quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+        #     load_in_4bit=True,
+        #     bnb_4bit_quant_type="nf4",  # Recommended for better performance
+        #     bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
+        #     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
+        # )
+
+        # quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

         # Load the model with quantization (optional)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            # device_map="auto",  # Automatically places parts of the model on GPU/CPU
-            # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-            device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-            # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
-            quantization_config=quantization_config_8bit
-        )
+        if modelconfig.bits_and_bytes_config != None:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+                # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+                device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+                # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
+                quantization_config=modelconfig.bits_and_bytes_config
+            )
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                modelconfig.model_name,
+                device_map="cuda",
+            )

         # print("apply optimization")
         # self.model.generation_config.cache_implementation = "static"
@@ -59,25 +67,25 @@ class Inference:


         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(modelconfig.model_name)

         print("load took %.3fs" % (time.time() - t_start))

-        max_context_length = self.model.config.max_position_embeddings
+        self.max_context_length = self.model.config.max_position_embeddings


         self.tokenizer.chat_template = utils.load_json_file("chat_template.json")

-        print("max_context_length is %d tokens." % (max_context_length))
+        print("max_context_length is %d tokens." % (self.max_context_length))


-    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate(self, input_ids: torch.Tensor, print_stdout=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
             with torch.no_grad():
-                return self.generate_incremental_2(input_ids)
+                return self.generate_incremental_2(input_ids, print_stdout)


-    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_batch(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         outputs = self.model.generate(
             input_ids,  # **inputs, inputs["input_ids"]
             max_new_tokens=500,  # max_length=max_context_length,
@@ -90,11 +98,12 @@ class Inference:
         # skip all input tokens and only output the additional generated part of the conversation
         input_token_count = len(input_ids[0])
         out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-        print(out_text)
+        if print_stdout:
+            print(out_text)
         return outputs, out_text


-    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental_2(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         generated_tokens = input_ids

         past_key_values = DynamicCache()
@@ -126,12 +135,14 @@ class Inference:
             # Decode and print the newly generated token (skip special tokens)
             # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
             out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True)  # Print without newline

             # Check if the generated token is the end-of-sequence token
             # if next_token.item() == self.tokenizer.eos_token_id:
             if new_tokens[-1].item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break

             # n += 1
@@ -150,12 +161,12 @@ class Inference:
         return generated_tokens, full_output


-    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         with torch.inference_mode():
-            return self._generate_incremental(input_ids)
+            return self._generate_incremental(input_ids, print_stdout)


-    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+    def _generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
         # Start with the initial input tokens
         generated_tokens = input_ids  # Initially, this is just the input tokens

@@ -183,11 +194,13 @@ class Inference:

             # Decode and print the newly generated token (skip special tokens)
             out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+            if print_stdout:
+                print(out_text, end="", flush=True)  # Print without newline

             # Check if the generated token is the end-of-sequence token
             if next_token.item() == self.tokenizer.eos_token_id:
-                print("")
+                if print_stdout:
+                    print("")
                 break

             n += 1
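The inference.py change takes the model and quantization settings from Modelconfig and threads a print_stdout flag through every generate path so callers can run generation silently (the profiling script below relies on this). A short sketch of both modes, with the model name as a placeholder:

# Sketch: the print_stdout flag added in this commit toggles token streaming to stdout.
from inference import Inference
from modelconfig import Modelconfig

inference = Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
input_ids = inference.tokenize([{"role": "user", "content": "Hello!"}], tokenize=True)

tokens, text = inference.generate(input_ids)                             # streams tokens as they are produced
tokens, text = inference.generate_batch(input_ids, print_stdout=False)   # silent, e.g. for profiling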
inference_profile_test.py (new file, 76)
@@ -0,0 +1,76 @@
from inference import Inference
from modelconfig import Modelconfig
import time
import nvidia_smi
import torch
import gc


def empty_cuda():
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()
    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)

    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]
    print("model: %s" % model_conf.model_name)
    print("tokens: %d tk" % tokens)
    print("time: %.3f s" % took)
    print("speed: %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top: %d MB" % vram_top)
    print("context: %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))


if __name__ == "__main__":
    main()
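inference_profile_test.py relies on an nvidia_smi module that is not part of this diff; it is assumed to expose get_gpu_stats() returning at least a "memory_used" value in MB. One possible shape for such a helper, sketched with the pynvml bindings (assumed interface, not the author's implementation):

# Sketch of an nvidia_smi.get_gpu_stats() compatible helper (assumption, not from this commit).
import pynvml

def get_gpu_stats(device_index: int = 0) -> dict:
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    pynvml.nvmlShutdown()
    return {
        "memory_used": mem.used // (1024 * 1024),    # MB, as expected by inference_profile_test.py
        "memory_total": mem.total // (1024 * 1024),  # MB
        "gpu_utilization": util.gpu,                 # percent
    }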
llama.py (188)
@@ -1,175 +1,43 @@
-import time
-import random
-from tool_helper import tool_list, parse_and_execute_tool_call
+from tool_helper import tool_list
 from tool_functions import register_dummy
-from inference import Inference, torch_reseed
+from inference import Inference
 import datetime
+import model_selection
+from generation_loop import Terminal, msg

-
-messages = []
-inference = None
-
-
-# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
-systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
-
-# system message for role flip so the model automatically answers for the user
-roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
-
-# system messages and user message to bring the model to summarize the entire conversation
-summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
-summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}
-
-# system message to create a conversation title
-title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}


 register_dummy()


-def append_generate_chat(input_text: str, role="user"):
-    t_start = time.time()
-
-    # generate AI response
-    if input_text != None:
-        messages.append({"role": role, "content": input_text})
-
-    inputs = inference.tokenize(messages, tokenize=True)
-
-    outputs, out_text = inference.generate(inputs)
-
-    # append result to message history
-    messages.append({"role": "assistant", "content": out_text})
-
-    print("")
-    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))
-
-    # handle tool call and check if a tool call has happened.
-    tool_result = parse_and_execute_tool_call(out_text, tool_list)
-    if tool_result != None:
-        # tool call happened
-        tool_result = "<tool_response>%s</tool_response>" % tool_result
-        # depending on the chat template the tool response tags must or must not be passed. :(
-        append_generate_chat(tool_result, role="tool")
-
-
-def main():
-    global messages
-    global inference
-
-    inference = Inference()
+def initialize_config(inference: Inference) -> Terminal:

+    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
+    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information."
     current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
-    messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]
+    append_toolcalls = False
+    if append_toolcalls:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
+    else:
+        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

-    while True:
-        # print an input prompt to receive text or commands
-        input_text = input(">>> ")
-        print("")
-
-        if input_text.startswith("!"):
-            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
-            # append_generate_chat("%s" % input_text[1:], role="tool") # depending on the chat template the tool response tags must or must not be passed. :(
-
-        elif input_text.startswith("/clear"):
-            print("clearing chat history")
-            start_msg = messages[0]
-            messages = [start_msg]
-            print("")
-
-        elif input_text.startswith("/history"):
-            history = inference.tokenize(messages, tokenize=False)
-            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
-            print(history)
-
-        elif input_text.startswith("/undo"):
-            if len(messages) > 2:
-                print("undo latest prompt")
-                messages = messages[:-2]
-            else:
-                print("cannot undo because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/regen"):
-            if len(messages) >= 2:
-                print("regenerating message (not working)")
-                messages = messages[:-1]
-                seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-                torch_reseed(seed)
-                append_generate_chat(None)
-            else:
-                print("cannot regenerate because there are not enough messages on history.")
-            print("")
-
-        elif input_text.startswith("/more"):
-            append_generate_chat(None)
-
-        elif input_text.startswith("/file"):
-            filename = input_text[len("/file "):]
-            print("read '%s' for prompt:" % filename)
-            with open(filename, "r") as f:
-                content = f.read()
-            print(content)
-            append_generate_chat(content)
-
-        elif input_text.startswith("/auto"):
-            messages_backup = messages
-            messages = [roleflip]
-            for m in messages_backup:
-                role = m["role"]
-                content = m["content"]
-                if role == "user":
-                    role = "assistant"
-                elif role == "assistant":
-                    role = "user"
-                if role != "system":
-                    messages.append({"role": role, "content": content})
-            append_generate_chat(None)  # will automatically advance the conversation as 'user'
-            last_message = messages[-1]
-            last_message["role"] = "user"
-            messages = messages_backup + [last_message]
-            append_generate_chat(None)  # 'regular' chatbot answer
-
-        elif input_text.startswith("/summarize"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
-            # messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/title"):
-            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
-            messages_temp = [title_prompt] + messages_temp  #+ [dict(title)]  # copy dict in last instance
-            messages_temp[-1]["role"] = "user"
-            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate(input_ids)
-
-        elif input_text.startswith("/help"):
-            print("!<prompt> answer as 'tool' in <tool_response> tags")
-            print("/clear clear chat history")
-            print("/undo undo latest prompt")
-            print("/regen regenerate the last message")
-            print("/more generate more additional information")
-            print("/file read prompt input from file")
-            print("/auto automatically advance conversation")
-            print("/summarize generate a summary of the chat")
-            print("/title generate a title of the chat")
-            print("/help print this message")
-            print("")
-
-        elif input_text.startswith("/"):
-            print("unknown command.")
-
-        else:
-            append_generate_chat(input_text)
+    terminal = Terminal(inference, systemmessage)
+
+    # system message for role flip so the model automatically answers for the user
+    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")
+
+    # system messages and user message to bring the model to summarize the entire conversation
+    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
+    terminal.summarize_user = msg("system", "Can you summarize the conversation?")
+
+    # system message to create a conversation title
+    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
+    return terminal


 if __name__ == "__main__":
-    main()
+    inference = Inference(model_selection.get_model())
+    terminal = initialize_config(inference)
+    terminal.join()
model_selection.py (new file, 95)
@@ -0,0 +1,95 @@
from modelconfig import Modelconfig


def get_model() -> Modelconfig:

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model
modelconfig.py (new file, 20)
@@ -0,0 +1,20 @@
from transformers import BitsAndBytesConfig
import torch


class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",  # Recommended for better performance
                bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
                bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config
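Modelconfig resolves its quantization settings in three ways: build a default NF4 4-bit config, build a plain 8-bit config, or pass a caller-supplied BitsAndBytesConfig through unchanged. A small usage sketch (model names taken from model_selection.py; the custom config values are illustrative):

# Sketch: the three ways Modelconfig resolves its quantization config.
import torch
from transformers import BitsAndBytesConfig
from modelconfig import Modelconfig

cfg_8bit = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)   # default 8-bit config
cfg_4bit = Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True)   # default NF4 4-bit config
cfg_custom = Modelconfig(                                                  # explicit config passed through
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    bits_and_bytes_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
)
cfg_full = Modelconfig("unsloth/Llama-3.2-1B")                             # no quantization (bits_and_bytes_config is None)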
@@ -1,3 +1,4 @@
 transformers
 accelerate
 bitsandbytes
+pytest
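With pytest added to the dependencies, the flag handling in modelconfig.py can be checked without a GPU. A minimal test sketch (hypothetical file test_modelconfig.py, not part of this diff):

# Sketch: pytest-style checks for Modelconfig's quantization selection (hypothetical test).
from modelconfig import Modelconfig

def test_default_is_unquantized():
    assert Modelconfig("unsloth/Llama-3.2-1B").bits_and_bytes_config is None

def test_8bit_flag_builds_config():
    cfg = Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True)
    assert cfg.bits_and_bytes_config.load_in_8bit

def test_4bit_flag_builds_config():
    cfg = Modelconfig("unsloth/Llama-3.2-1B", load_in_4bit=True)
    assert cfg.bits_and_bytes_config.load_in_4bit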