diff --git a/inference.py b/inference.py
new file mode 100644
index 0000000..41b418c
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,148 @@
+if __name__ == "__main__":
+    # this check sits at the top because initializing torch/transformers takes a long time; fail fast.
+    raise Exception("cannot execute this file directly")
+
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import torch
+import time
+import utils
+import re
+
+
+
+class Inference:
+    def __init__(self):
+        print("loading LLM...")
+        t_start = time.time()
+
+        # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+        # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+
+        quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",  # Recommended for better performance
+            bnb_4bit_use_double_quant=True,  # Optional: further quantization for more memory saving
+            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
+        )
+
+        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+
+        # Load the model with quantization (optional)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            # device_map="auto",  # automatically places parts of the model on GPU/CPU
+            device_map="cuda",  # place the whole model on the GPU
+            # load_in_8bit=True,  # enables 8-bit quantization if bitsandbytes is installed
+            quantization_config=quantization_config_8bit
+        )
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        print("load took %.3fs" % (time.time() - t_start))
+
+        max_context_length = self.model.config.max_position_embeddings
+
+        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")
+
+        print("max_context_length is %d tokens." % (max_context_length))
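+
+
+    # Two generation paths are provided below:
+    #  - generate_batch() produces the whole reply with a single model.generate() call.
+    #  - generate_incremental() decodes greedily one token at a time and streams the text to stdout,
+    #    so partial output is visible and generation can be aborted with Ctrl+C.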
+    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        outputs = self.model.generate(
+            input_ids,  # **inputs, inputs["input_ids"]
+            max_new_tokens=500,  # max_length=max_context_length,
+            pad_token_id=self.tokenizer.pad_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            do_sample=True,
+            num_return_sequences=1
+        )
+        # skip all input tokens and only output the additional generated part of the conversation
+        input_token_count = len(input_ids[0])
+        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+        print(out_text)
+        return outputs, out_text
+
+
+    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        with torch.inference_mode():
+            return self._generate_incremental(input_ids)
+
+
+    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        # Start with the initial input tokens
+        generated_tokens = input_ids  # Initially, this is just the input tokens
+
+        n = 0
+        try:
+
+            # Loop to generate one token at a time
+            while True:
+                # Call the model with the current tokens
+                outputs = self.model(input_ids=generated_tokens, use_cache=True)
+
+                # Get the next token (the last token from the generated sequence)
+                next_token = outputs.logits.argmax(dim=-1)[:, -1]
+
+                # Append the new token to the sequence
+                generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+
+                # Decode and print the newly generated token (skip special tokens)
+                out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
+                print(out_text, end="", flush=True)  # Print without newline
+
+                # Check if the generated token is the end-of-sequence token
+                if next_token.item() == self.tokenizer.eos_token_id:
+                    print("")
+                    break
+
+                n += 1
+                if n >= 15:
+                    n = 0
+                    torch.cuda.empty_cache()
+
+        except KeyboardInterrupt:
+            pass
+
+        # Once done, return the full generated sequence
+        input_token_count = len(input_ids[0])
+        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+
+        torch.cuda.empty_cache()
+
+        return generated_tokens, full_output
+
+
+    def tokenize(self, messages: list[dict], tokenize: bool) -> str | torch.Tensor:
+        if tokenize:
+            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  # continue_final_message=True,
+            inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
+            return inputs["input_ids"]
+        else:
+            message = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            return message
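+
+
+    # generate_tool_use_header: renders the chat template once with the tool definitions
+    # and extracts the resulting system block with a regex, so the tool instructions can
+    # be merged into the real system message by the caller.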
+    def generate_tool_use_header(self, tools: list[callable]) -> str:
+        temp_messages = [{}]  # an empty list is not accepted here, but a list with one empty dict behaves like one.
+        s = self.tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+        pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+        match = re.search(pattern, s, re.DOTALL)
+        if not match:
+            raise Exception("Failed to regex match the template tool system text.")
+        extraction = match.group(1)
+        return extraction
+
+
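+# Reseed both the CPU and CUDA random number generators (used by the /regenerate command in llama.py).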
+def torch_reseed(seed: int):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
diff --git a/llama.py b/llama.py
index 5e72830..5253243 100644
--- a/llama.py
+++ b/llama.py
@@ -1,54 +1,55 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import time
-import torch
+# import torch
 import random
 from tool_helper import tool_list, parse_and_execute_tool_call
 from tool_functions import register_dummy
-import utils
-import re
+# import utils
+# import re
+from inference import Inference, torch_reseed
 
 
-t_start = time.time()
+# t_start = time.time()
 
 
-# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-# model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
-# model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
-# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+# # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+# # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+# # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
 
 
-quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",  # Recommended for better performance
-    bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
-    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
-)
+# quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+#     load_in_4bit=True,
+#     bnb_4bit_quant_type="nf4",  # Recommended for better performance
+#     bnb_4bit_use_double_quant=True,  # Optional: Further quantization for more memory saving
+#     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
+# )
 
-quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
 
-# Load the model with quantization (optional)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
-    # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_8bit
-)
+# # Load the model with quantization (optional)
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+#     # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     # load_in_8bit=True,  # Enables 8-bit quantization if bitsandbytes is installed
+#     quantization_config=quantization_config_8bit
+# )
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# # Load tokenizer
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-print("load took %.3fs" % (time.time() - t_start))
+# print("load took %.3fs" % (time.time() - t_start))
 
-max_context_length = model.config.max_position_embeddings
+# max_context_length = model.config.max_position_embeddings
 
-tokenizer.chat_template = utils.load_json_file("chat_template.json")
+# tokenizer.chat_template = utils.load_json_file("chat_template.json")
 
-print("max_context_length is %d tokens." % (max_context_length))
+# print("max_context_length is %d tokens." % (max_context_length))
 
 
 # Generate text
@@ -95,6 +96,8 @@ messages = [
     # {"role": "user", "content": "Hello, who are you?"}
 ]
 
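+# global Inference instance; created in main() before the chat loop starts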
+inference = None
+
 systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
 
 roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -106,67 +109,67 @@ register_dummy()
 
 
 
-def generate_batch(inputs):
-    outputs = model.generate(
-        inputs["input_ids"],  # **inputs,
-        max_new_tokens=500,  # max_length=max_context_length,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        do_sample=True,
-        num_return_sequences=1
-    )
-    # skip all input tokens and only output the additional generated part of the conversation
-    input_token_count = len(inputs["input_ids"][0])
-    out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-    print(out_text)
-    return outputs, out_text
+# def generate_batch(inputs):
+#     outputs = model.generate(
+#         inputs["input_ids"],  # **inputs,
+#         max_new_tokens=500,  # max_length=max_context_length,
+#         pad_token_id=tokenizer.pad_token_id,
+#         eos_token_id=tokenizer.eos_token_id,
+#         do_sample=True,
+#         num_return_sequences=1
+#     )
+#     # skip all input tokens and only output the additional generated part of the conversation
+#     input_token_count = len(inputs["input_ids"][0])
+#     out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+#     print(out_text)
+#     return outputs, out_text
 
 
-def generate_incremental(inputs):
-    # Start with the initial input tokens
-    input_ids = inputs["input_ids"]
-    generated_tokens = input_ids  # Initially, this is just the input tokens
+# def generate_incremental(inputs):
+#     # Start with the initial input tokens
+#     input_ids = inputs["input_ids"]
+#     generated_tokens = input_ids  # Initially, this is just the input tokens
 
-    n = 0
-    try:
+#     n = 0
+#     try:
 
-        # Loop to generate one token at a time
-        while True:
-            # Call the model with the current tokens
-            outputs = model(input_ids=generated_tokens, use_cache=True)
+#         # Loop to generate one token at a time
+#         while True:
+#             # Call the model with the current tokens
+#             outputs = model(input_ids=generated_tokens, use_cache=True)
 
-            # Get the next token (the last token from the generated sequence)
-            next_token = outputs.logits.argmax(dim=-1)[:, -1]
+#             # Get the next token (the last token from the generated sequence)
+#             next_token = outputs.logits.argmax(dim=-1)[:, -1]
 
-            # Append the new token to the sequence
-            generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+#             # Append the new token to the sequence
+#             generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
 
-            # Decode and print the newly generated token (skip special tokens)
-            out_text = tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+#             # Decode and print the newly generated token (skip special tokens)
+#             out_text = tokenizer.decode(next_token, skip_special_tokens=True)
+#             print(out_text, end="", flush=True)  # Print without newline
 
-            # Check if the generated token is the end-of-sequence token
-            if next_token.item() == tokenizer.eos_token_id:
-                print("")
-                break
+#             # Check if the generated token is the end-of-sequence token
+#             if next_token.item() == tokenizer.eos_token_id:
+#                 print("")
+#                 break
 
-            n += 1
-            if n >= 15:
-                n = 0
-                torch.cuda.empty_cache()
+#             n += 1
+#             if n >= 15:
+#                 n = 0
+#                 torch.cuda.empty_cache()
 
-    except KeyboardInterrupt:
-        pass
+#     except KeyboardInterrupt:
+#         pass
 
-    # Once done, return the full generated sequence
-    input_token_count = len(inputs["input_ids"][0])
-    full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+#     # Once done, return the full generated sequence
+#     input_token_count = len(inputs["input_ids"][0])
+#     full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
 
-    torch.cuda.empty_cache()
+#     torch.cuda.empty_cache()
 
-    return generated_tokens, full_output
+#     return generated_tokens, full_output
 
 
 def append_generate_chat(input_text: str, role="user"):
@@ -176,15 +179,16 @@ def append_generate_chat(input_text: str, role="user"):
     if input_text != None:
         messages.append({"role": role, "content": input_text})
 
-    # input_text = "Hello, who are you?"
-    # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  # continue_final_message=True,
-    inputs = {key: value.to(model.device) for key, value in inputs.items()}
-    # inputs = {key: value.to("cpu") for key, value in inputs.items()}
-    # inputs["input_ids"] = inputs["input_ids"][:, 1:]
+    # # input_text = "Hello, who are you?"
+    # # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
+    # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  # continue_final_message=True,
+    # inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    # # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # # inputs["input_ids"] = inputs["input_ids"][:, 1:]
+
+    inputs = inference.tokenize(messages, tokenize=True)
 
-    with torch.inference_mode():
-        outputs, out_text = generate_incremental(inputs)
+    outputs, out_text = inference.generate_incremental(inputs)
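+    # inference.generate_incremental() runs under torch.inference_mode() internally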
 
     # append result to message history
     messages.append({"role": "assistant", "content": out_text})
@@ -202,21 +206,24 @@
 
 
 
-def generate_tool_use_header(tools: list[callable]) -> str:
-    temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
-    s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
-    pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
-    match = re.search(pattern, s, re.DOTALL)
-    if not match:
-        raise Exception("Failed to regex match the template tool system text.")
-    extraction = match.group(1)
-    return extraction
+# def generate_tool_use_header(tools: list[callable]) -> str:
+#     temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
+#     s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+#     pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+#     match = re.search(pattern, s, re.DOTALL)
+#     if not match:
+#         raise Exception("Failed to regex match the template tool system text.")
+#     extraction = match.group(1)
+#     return extraction
 
 
 def main():
     global messages
+    global inference
+
+    inference = Inference()
 
-    messages = [{"role": "system", "content": systemmessage + "\n" + generate_tool_use_header(tool_list)}]
+    messages = [{"role": "system", "content": systemmessage + "\n" + inference.generate_tool_use_header(tool_list)}]
 
     while True:
         # print an input prompt to receive text or commands
@@ -235,7 +242,8 @@ def main():
             print("")
 
         elif input_text.startswith("/history"):
-            history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            history = inference.tokenize(messages, tokenize=False)
+            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
             print(history)
 
         elif input_text.startswith("/undo"):
@@ -251,8 +259,7 @@
             print("regenerating message (not working)")
            messages = messages[:-1]
             seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-            torch.manual_seed(seed)
-            torch.cuda.manual_seed_all(seed)
+            torch_reseed(seed)
             append_generate_chat(None)
         else:
             print("cannot regenerate because there are not enough messages on history.")
@@ -304,3 +311,7 @@ def main():
 
     else:
         append_generate_chat(input_text)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file