lift inference part our of llama.py

6 months ago · 4d034c7f2b
2 changed files with 258 additions and 99 deletions
--- a/inference.py
+++ b/inference.py
@ -0,0 +1,148 @@
 if __name__ == "__main__":
    # this message is at the start, because initializing torch/transformers takes lots of time. fail fast.
    raise Exception("cannot execute this file directly")
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import time
 import utils
 import re
 class Inference:
    def __init__(self):
        print("loading LLM...")
        t_start = time.time()
        # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
        quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",  # Recommended for better performance
            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        )
        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
        # Load the model with quantization (optional)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # device_map="auto",  # Automatically places parts of the model on GPU/CPU
            # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
            device_map="cuda",  # Automatically places parts of the model on GPU/CPU
            # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
            quantization_config=quantization_config_8bit
        )
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("load took %.3fs" % (time.time() - t_start))
        max_context_length = self.model.config.max_position_embeddings
        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")
        print("max_context_length is %d tokens." % (max_context_length))
    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids,  # **inputs, inputs["input_ids"]
            max_new_tokens=500,  # max_length=max_context_length,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            num_return_sequences=1
        )
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        return outputs, out_text
    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            return self._generate_incremental(input_ids)
    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        # Start with the initial input tokens
        generated_tokens = input_ids  # Initially, this is just the input tokens
        n = 0
        try:
            # Loop to generate one token at a time
            while True:
                # Call the model with the current tokens
                outputs = self.model(input_ids=generated_tokens, use_cache=True)
                # Get the next token (the last token from the generated sequence)
                next_token = outputs.logits.argmax(dim=-1)[:, -1]
                # Append the new token to the sequence
                generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
                # Decode and print the newly generated token (skip special tokens)
                out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
                print(out_text, end="", flush=True)  # Print without newline
                # Check if the generated token is the end-of-sequence token
                if next_token.item() == self.tokenizer.eos_token_id:
                    print("")
                    break
                n += 1
                if n >= 15:
                    n = 0
                    torch.cuda.empty_cache()
        except KeyboardInterrupt:
            pass
        # Once done, return the full generated sequence
        input_token_count = len(input_ids[0])
        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
        torch.cuda.empty_cache()
        return generated_tokens, full_output
    def tokenize(self, messages: list[dict], tokenize: bool) -> str | torch.Tensor:
        if tokenize:
            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
            inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
            return inputs["input_ids"]
        else:
            message = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            return message
    def generate_tool_use_header(self, tools: list[callable]) -> str:
        temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
        s = self.tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
        pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
        match = re.search(pattern, s, re.DOTALL)
        if not match:
            raise Exception("Failed to regex match the template tool system text.")
        extraction = match.group(1)
        return extraction
 def torch_reseed(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
--- a/llama.py
+++ b/llama.py
@ -1,54 +1,55 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import time
-import torch
+# import torch
 import random
 from tool_helper import tool_list, parse_and_execute_tool_call
 from tool_functions import register_dummy
-import utils
+# import utils
-import re
+# import re
 from inference import Inference, torch_reseed
-t_start = time.time()
+# t_start = time.time()
-# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-# model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+# # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
-# model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+# # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
-# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+# # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
-quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+# quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
-    load_in_4bit=True,
+#     load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",  # Recommended for better performance
+#     bnb_4bit_quant_type="nf4",  # Recommended for better performance
-    bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+#     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
-    bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+#     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
-)
+# )
-quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
-# Load the model with quantization (optional)
+# # Load the model with quantization (optional)
-model = AutoModelForCausalLM.from_pretrained(
+# model = AutoModelForCausalLM.from_pretrained(
-    model_name,
+#     model_name,
-    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+#     # device_map="auto",  # Automatically places parts of the model on GPU/CPU
-    # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
+#     # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_8bit
+#     quantization_config=quantization_config_8bit
-)
+# )
-# Load tokenizer
+# # Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("load took %.3fs" % (time.time() - t_start))
+# print("load took %.3fs" % (time.time() - t_start))
-max_context_length = model.config.max_position_embeddings
+# max_context_length = model.config.max_position_embeddings
-tokenizer.chat_template = utils.load_json_file("chat_template.json")
+# tokenizer.chat_template = utils.load_json_file("chat_template.json")
-print("max_context_length is %d tokens." % (max_context_length))
+# print("max_context_length is %d tokens." % (max_context_length))
 # Generate text
@ -95,6 +96,8 @@ messages = [
    # {"role": "user", "content": "Hello, who are you?"}
 ]
 inference = None
 systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
 roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@ -106,67 +109,67 @@ register_dummy()
-def generate_batch(inputs):
+# def generate_batch(inputs):
-    outputs = model.generate(
+#     outputs = model.generate(
-        inputs["input_ids"],  # **inputs,
+#         inputs["input_ids"],  # **inputs,
-        max_new_tokens=500,  # max_length=max_context_length,
+#         max_new_tokens=500,  # max_length=max_context_length,
-        pad_token_id=tokenizer.pad_token_id,
+#         pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
+#         eos_token_id=tokenizer.eos_token_id,
-        do_sample=True,
+#         do_sample=True,
-        num_return_sequences=1
+#         num_return_sequences=1
-    )
+#     )
-    # skip all input tokens and only output the additional generated part of the conversation
+#     # skip all input tokens and only output the additional generated part of the conversation
-    input_token_count = len(inputs["input_ids"][0])
+#     input_token_count = len(inputs["input_ids"][0])
-    out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+#     out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-    print(out_text)
+#     print(out_text)
-    return outputs, out_text
+#     return outputs, out_text
-def generate_incremental(inputs):
+# def generate_incremental(inputs):
-    # Start with the initial input tokens
+#     # Start with the initial input tokens
-    input_ids = inputs["input_ids"]
+#     input_ids = inputs["input_ids"]
-    generated_tokens = input_ids  # Initially, this is just the input tokens
+#     generated_tokens = input_ids  # Initially, this is just the input tokens
-    n = 0
+#     n = 0
-    try:
+#     try:
-        # Loop to generate one token at a time
+#         # Loop to generate one token at a time
-        while True:
+#         while True:
-            # Call the model with the current tokens
+#             # Call the model with the current tokens
-            outputs = model(input_ids=generated_tokens, use_cache=True)
+#             outputs = model(input_ids=generated_tokens, use_cache=True)
-            # Get the next token (the last token from the generated sequence)
+#             # Get the next token (the last token from the generated sequence)
-            next_token = outputs.logits.argmax(dim=-1)[:, -1]
+#             next_token = outputs.logits.argmax(dim=-1)[:, -1]
-            # Append the new token to the sequence
+#             # Append the new token to the sequence
-            generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+#             generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
-            # Decode and print the newly generated token (skip special tokens)
+#             # Decode and print the newly generated token (skip special tokens)
-            out_text = tokenizer.decode(next_token, skip_special_tokens=True)
+#             out_text = tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+#             print(out_text, end="", flush=True)  # Print without newline
-            # Check if the generated token is the end-of-sequence token
+#             # Check if the generated token is the end-of-sequence token
-            if next_token.item() == tokenizer.eos_token_id:
+#             if next_token.item() == tokenizer.eos_token_id:
-                print("")
+#                 print("")
-                break
+#                 break
-            n += 1
+#             n += 1
-            if n >= 15:
+#             if n >= 15:
-                n = 0
+#                 n = 0
-                torch.cuda.empty_cache()
+#                 torch.cuda.empty_cache()
-    except KeyboardInterrupt:
+#     except KeyboardInterrupt:
-        pass
+#         pass
-    # Once done, return the full generated sequence
+#     # Once done, return the full generated sequence
-    input_token_count = len(inputs["input_ids"][0])
+#     input_token_count = len(inputs["input_ids"][0])
-    full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+#     full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
-    torch.cuda.empty_cache()
+#     torch.cuda.empty_cache()
-    return generated_tokens, full_output
+#     return generated_tokens, full_output
 def append_generate_chat(input_text: str, role="user"):
@ -176,15 +179,16 @@ def append_generate_chat(input_text: str, role="user"):
    if input_text != None:
        messages.append({"role": role, "content": input_text})
-    # input_text = "Hello, who are you?"
+    # # input_text = "Hello, who are you?"
-    # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
+    # # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
+    # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
-    inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    # inputs = {key: value.to(model.device) for key, value in inputs.items()}
-    # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # # inputs = {key: value.to("cpu") for key, value in inputs.items()}
-    # inputs["input_ids"] = inputs["input_ids"][:, 1:]
+    # # inputs["input_ids"] = inputs["input_ids"][:, 1:]
    inputs = inference.tokenize(messages, tokenize=True)
-    with torch.inference_mode():
+    outputs, out_text = inference.generate_incremental(inputs)
        outputs, out_text = generate_incremental(inputs)
    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
@ -202,21 +206,24 @@ def append_generate_chat(input_text: str, role="user"):
-def generate_tool_use_header(tools: list[callable]) -> str:
+# def generate_tool_use_header(tools: list[callable]) -> str:
-    temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
+#     temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
-    s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+#     s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
-    pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+#     pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
-    match = re.search(pattern, s, re.DOTALL)
+#     match = re.search(pattern, s, re.DOTALL)
-    if not match:
+#     if not match:
-        raise Exception("Failed to regex match the template tool system text.")
+#         raise Exception("Failed to regex match the template tool system text.")
-    extraction = match.group(1)
+#     extraction = match.group(1)
-    return extraction
+#     return extraction
 def main():
    global messages
    global inference
    inference = Inference()
-    messages = [{"role": "system", "content": systemmessage + "\n" + generate_tool_use_header(tool_list)}]
+    messages = [{"role": "system", "content": systemmessage + "\n" + inference.generate_tool_use_header(tool_list)}]
    while True:
        # print an input prompt to receive text or commands
@ -235,7 +242,8 @@ def main():
            print("")
        elif input_text.startswith("/history"):
-            history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            history = inference.tokenize(messages, tokenize=False)
            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            print(history)
        elif input_text.startswith("/undo"):
@ -251,8 +259,7 @@ def main():
                print("regenerating message (not working)")
                messages = messages[:-1]
                seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-                torch.manual_seed(seed)
+                torch_reseed(seed)
                torch.cuda.manual_seed_all(seed)
                append_generate_chat(None)
            else:
                print("cannot regenerate because there are not enough messages on history.")
@ -304,3 +311,7 @@ def main():
        else:
            append_generate_chat(input_text)
 if __name__ == "__main__":
    main()