lift inference part out of llama.py

2025-01-02 02:08:58 +01:00
parent c189df9547
commit 4d034c7f2b
2 changed files with 258 additions and 99 deletions
--- a/inference.py
+++ b/inference.py
@@ -0,0 +1,148 @@
+if __name__ == "__main__":
+    # this message is at the start, because initializing torch/transformers takes lots of time. fail fast.
+    raise Exception("cannot execute this file directly")
+
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import torch
+import time
+import utils
+import re
+
+
+
+class Inference:
+    def __init__(self):
+        print("loading LLM...")
+        t_start = time.time()
+
+        # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+        # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+
+
+        quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",  # Recommended for better performance
+            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+        )
+
+        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+
+        # Load the model with quantization (optional)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+            # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+            device_map="cuda",  # Load the whole model onto the CUDA device (no automatic GPU/CPU split)
+            # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
+            quantization_config=quantization_config_8bit
+        )
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        print("load took %.3fs" % (time.time() - t_start))
+
+        max_context_length = self.model.config.max_position_embeddings
+
+
+        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")
+
+        print("max_context_length is %d tokens." % (max_context_length))
+
+
+    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        outputs = self.model.generate(
+            input_ids,  # **inputs, inputs["input_ids"]
+            max_new_tokens=500,  # max_length=max_context_length,
+            pad_token_id=self.tokenizer.pad_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            do_sample=True,
+            num_return_sequences=1
+        )
+        # skip all input tokens and only output the additional generated part of the conversation
+        input_token_count = len(input_ids[0])
+        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+        print(out_text)
+        return outputs, out_text
+
+
+
+    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        with torch.inference_mode():
+            return self._generate_incremental(input_ids)
+
+
+    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        # Start with the initial input tokens
+        generated_tokens = input_ids  # Initially, this is just the input tokens
+
+        n = 0
+        try:
+
+            # Loop to generate one token at a time
+            while True:
+                # Call the model with the current tokens
+                outputs = self.model(input_ids=generated_tokens, use_cache=True)
+
+                # Get the next token (the last token from the generated sequence)
+                next_token = outputs.logits.argmax(dim=-1)[:, -1]
+
+                # Append the new token to the sequence
+                generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+
+                # Decode and print the newly generated token (skip special tokens)
+                out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
+                print(out_text, end="", flush=True)  # Print without newline
+
+                # Check if the generated token is the end-of-sequence token
+                if next_token.item() == self.tokenizer.eos_token_id:
+                    print("")
+                    break
+
+                n += 1
+                if n >= 15:
+                    n = 0
+                    torch.cuda.empty_cache()
+
+        except KeyboardInterrupt:
+            pass
+
+        # Once done, return the full generated sequence
+        input_token_count = len(input_ids[0])
+        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+
+        torch.cuda.empty_cache()
+
+        return generated_tokens, full_output
+
+
+    def tokenize(self, messages: list[dict], tokenize: bool) -> str | torch.Tensor:
+        if tokenize:
+            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
+            inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
+            return inputs["input_ids"]
+        else:
+            message = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            return message
+
+
+    def generate_tool_use_header(self, tools: list[callable]) -> str:
+        temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
+        s = self.tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+        pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+        match = re.search(pattern, s, re.DOTALL)
+        if not match:
+            raise Exception("Failed to regex match the template tool system text.")
+        extraction = match.group(1)
+        return extraction
+
+
+def torch_reseed(seed: int):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
--- a/llama.py
+++ b/llama.py
@@ -1,54 +1,55 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import time
-import torch
+# import torch
 import random
 from tool_helper import tool_list, parse_and_execute_tool_call
 from tool_functions import register_dummy
-import utils
-import re
+# import utils
+# import re
+from inference import Inference, torch_reseed

-t_start = time.time()
+# t_start = time.time()

-# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
-# model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
-# model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
-# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+# # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+# # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+# # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model


-quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",  # Recommended for better performance
-    bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
-    bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
-)
+# quantization_config_4bit = BitsAndBytesConfig(  # tool calls don't really work in 4 bit mode
+#     load_in_4bit=True,
+#     bnb_4bit_quant_type="nf4",  # Recommended for better performance
+#     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+#     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+# )

-quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

-# Load the model with quantization (optional)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
-    # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    device_map="cuda",  # Automatically places parts of the model on GPU/CPU
-    # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_8bit
-)
+# # Load the model with quantization (optional)
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+#     # device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     device_map="cuda",  # Automatically places parts of the model on GPU/CPU
+#     # load_in_8bit=True,   # Enables 8-bit quantization if bitsandbytes is installed
+#     quantization_config=quantization_config_8bit
+# )

-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# # Load tokenizer
+# tokenizer = AutoTokenizer.from_pretrained(model_name)

-print("load took %.3fs" % (time.time() - t_start))
+# print("load took %.3fs" % (time.time() - t_start))

-max_context_length = model.config.max_position_embeddings
+# max_context_length = model.config.max_position_embeddings


-tokenizer.chat_template = utils.load_json_file("chat_template.json")
+# tokenizer.chat_template = utils.load_json_file("chat_template.json")




-print("max_context_length is %d tokens." % (max_context_length))
+# print("max_context_length is %d tokens." % (max_context_length))


 # Generate text
@@ -95,6 +96,8 @@ messages = [
    # {"role": "user", "content": "Hello, who are you?"}
 ]

+inference = None
+
 systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."

 roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -106,67 +109,67 @@ register_dummy()



-def generate_batch(inputs):
-    outputs = model.generate(
-        inputs["input_ids"],  # **inputs,
-        max_new_tokens=500,  # max_length=max_context_length,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        do_sample=True,
-        num_return_sequences=1
-    )
-    # skip all input tokens and only output the additional generated part of the conversation
-    input_token_count = len(inputs["input_ids"][0])
-    out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-    print(out_text)
-    return outputs, out_text
+# def generate_batch(inputs):
+#     outputs = model.generate(
+#         inputs["input_ids"],  # **inputs,
+#         max_new_tokens=500,  # max_length=max_context_length,
+#         pad_token_id=tokenizer.pad_token_id,
+#         eos_token_id=tokenizer.eos_token_id,
+#         do_sample=True,
+#         num_return_sequences=1
+#     )
+#     # skip all input tokens and only output the additional generated part of the conversation
+#     input_token_count = len(inputs["input_ids"][0])
+#     out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+#     print(out_text)
+#     return outputs, out_text



-def generate_incremental(inputs):
-    # Start with the initial input tokens
-    input_ids = inputs["input_ids"]
-    generated_tokens = input_ids  # Initially, this is just the input tokens
+# def generate_incremental(inputs):
+#     # Start with the initial input tokens
+#     input_ids = inputs["input_ids"]
+#     generated_tokens = input_ids  # Initially, this is just the input tokens

-    n = 0
-    try:
+#     n = 0
+#     try:

-        # Loop to generate one token at a time
-        while True:
-            # Call the model with the current tokens
-            outputs = model(input_ids=generated_tokens, use_cache=True)
+#         # Loop to generate one token at a time
+#         while True:
+#             # Call the model with the current tokens
+#             outputs = model(input_ids=generated_tokens, use_cache=True)

-            # Get the next token (the last token from the generated sequence)
-            next_token = outputs.logits.argmax(dim=-1)[:, -1]
+#             # Get the next token (the last token from the generated sequence)
+#             next_token = outputs.logits.argmax(dim=-1)[:, -1]

-            # Append the new token to the sequence
-            generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+#             # Append the new token to the sequence
+#             generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)

-            # Decode and print the newly generated token (skip special tokens)
-            out_text = tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True)  # Print without newline
+#             # Decode and print the newly generated token (skip special tokens)
+#             out_text = tokenizer.decode(next_token, skip_special_tokens=True)
+#             print(out_text, end="", flush=True)  # Print without newline

-            # Check if the generated token is the end-of-sequence token
-            if next_token.item() == tokenizer.eos_token_id:
-                print("")
-                break
+#             # Check if the generated token is the end-of-sequence token
+#             if next_token.item() == tokenizer.eos_token_id:
+#                 print("")
+#                 break

-            n += 1
-            if n >= 15:
-                n = 0
-                torch.cuda.empty_cache()
+#             n += 1
+#             if n >= 15:
+#                 n = 0
+#                 torch.cuda.empty_cache()

-    except KeyboardInterrupt:
-        pass
+#     except KeyboardInterrupt:
+#         pass


-    # Once done, return the full generated sequence
-    input_token_count = len(inputs["input_ids"][0])
-    full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+#     # Once done, return the full generated sequence
+#     input_token_count = len(inputs["input_ids"][0])
+#     full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)

-    torch.cuda.empty_cache()
+#     torch.cuda.empty_cache()

-    return generated_tokens, full_output
+#     return generated_tokens, full_output


 def append_generate_chat(input_text: str, role="user"):
@@ -176,15 +179,16 @@ def append_generate_chat(input_text: str, role="user"):
    if input_text != None:
        messages.append({"role": role, "content": input_text})

-    # input_text = "Hello, who are you?"
-    # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
-    inputs = {key: value.to(model.device) for key, value in inputs.items()}
-    # inputs = {key: value.to("cpu") for key, value in inputs.items()}
-    # inputs["input_ids"] = inputs["input_ids"][:, 1:]
+    # # input_text = "Hello, who are you?"
+    # # inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
+    # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True)  #continue_final_message=True,
+    # inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    # # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # # inputs["input_ids"] = inputs["input_ids"][:, 1:]

-    with torch.inference_mode():
-        outputs, out_text = generate_incremental(inputs)
+    inputs = inference.tokenize(messages, tokenize=True)
+
+    outputs, out_text = inference.generate_incremental(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
@@ -202,21 +206,24 @@ def append_generate_chat(input_text: str, role="user"):



-def generate_tool_use_header(tools: list[callable]) -> str:
-    temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
-    s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
-    pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
-    match = re.search(pattern, s, re.DOTALL)
-    if not match:
-        raise Exception("Failed to regex match the template tool system text.")
-    extraction = match.group(1)
-    return extraction
+# def generate_tool_use_header(tools: list[callable]) -> str:
+#     temp_messages = [{}]  # for some reason an empty array is not allowed but a {} inside works like an empty array.
+#     s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+#     pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+#     match = re.search(pattern, s, re.DOTALL)
+#     if not match:
+#         raise Exception("Failed to regex match the template tool system text.")
+#     extraction = match.group(1)
+#     return extraction


 def main():
    global messages
+    global inference

-    messages = [{"role": "system", "content": systemmessage + "\n" + generate_tool_use_header(tool_list)}]
+    inference = Inference()
+
+    messages = [{"role": "system", "content": systemmessage + "\n" + inference.generate_tool_use_header(tool_list)}]

    while True:
        # print an input prompt to receive text or commands
@@ -235,7 +242,8 @@ def main():
            print("")

        elif input_text.startswith("/history"):
-            history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            history = inference.tokenize(messages, tokenize=False)
+            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            print(history)

        elif input_text.startswith("/undo"):
@@ -251,8 +259,7 @@ def main():
                print("regenerating message (not working)")
                messages = messages[:-1]
                seed = random.randint(0, 2**32 - 1)  # Generate a random seed
-                torch.manual_seed(seed)
-                torch.cuda.manual_seed_all(seed)
+                torch_reseed(seed)
                append_generate_chat(None)
            else:
                print("cannot regenerate because there are not enough messages on history.")
@@ -304,3 +311,7 @@ def main():
        else:
            append_generate_chat(input_text)

+
+
+if __name__ == "__main__":
+    main()