
cleanup, tests

master · Florin Tobler · 6 months ago · commit 71e5fa96f3

Changed files:
  1. llama.py (170 lines changed)
  2. tests/helper.py (12 lines changed)
  3. tests/test_inference.py (122 lines changed)
  4. tests/test_tool_function_decorator.py (1 line changed)

llama.py (170 lines changed)

@@ -1,101 +1,12 @@
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
# import torch
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from tool_functions import register_dummy
# import utils
# import re
from inference import Inference, torch_reseed
# t_start = time.time()
# # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
# model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
# # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
# # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
# # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
# quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4", # Recommended for better performance
# bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
# bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
# )
# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
# # Load the model with quantization (optional)
# model = AutoModelForCausalLM.from_pretrained(
# model_name,
# # device_map="auto", # Automatically places parts of the model on GPU/CPU
# # device_map="cuda", # Automatically places parts of the model on GPU/CPU
# device_map="cuda", # Automatically places parts of the model on GPU/CPU
# # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
# quantization_config=quantization_config_8bit
# )
# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# print("load took %.3fs" % (time.time() - t_start))
# max_context_length = model.config.max_position_embeddings
# tokenizer.chat_template = utils.load_json_file("chat_template.json")
# print("max_context_length is %d tokens." % (max_context_length))
# Generate text
# schema = """
# {
# "properties": {
# "program": {
# "description": "Python program to be executed. The Message response to the input query is the output of this program",
# "title": "Program",
# "type": "string"
# },
# },
# "required": [
# "program"
# ],
# "title": "Response",
# "type": "object"
# }
# """
# """
# "confidence": {
# "description": "How sure you are the above message facts are true. Rate harsh from 0 to 1",
# "title": "Confidence",
# "type": "float"
# }
# """
# tool_assist = """
# You are a python assisted AI model. You may call the interpreter one or more times to assist with the user query. You might tell the user that you optained ground truth with the help of python or a calculator if asked about. The user is not able to see if python has been used, therefore do not expose and share failed attempts or syntax errors.
# To invoke a this function, the answer may start and end with <python_tool_call> and </python_tool_call> respectively. The rest must be a valid python script, additional text is not allowed before and after. Calling python is not needed when just providing example code.
# """
messages = [
# {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences."},
# {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. It is %s now." % datetime.datetime.now().strftime("%Y-%m-%d %H:%M")},
# {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. " + tool_assist},
# {"role": "system", "content": "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n%s\n</schema>" % schema},
# {"role": "system", "content": "You are a helpful assistant that answers by entering commands into a python interpreter. The user only sees the stdout of your python input."},
# {"role": "system", "content": "Make a summary of the below input prompt. Do not answer. The description should fit on 80 characters."},
# {"role": "user", "content": "Hello, who are you?"}
]
messages = []
inference = None
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
@@ -109,68 +20,6 @@ register_dummy()
# def generate_batch(inputs):
# outputs = model.generate(
# inputs["input_ids"], # **inputs,
# max_new_tokens=500, # max_length=max_context_length,
# pad_token_id=tokenizer.pad_token_id,
# eos_token_id=tokenizer.eos_token_id,
# do_sample=True,
# num_return_sequences=1
# )
# # skip all input tokens and only output the additional generated part of the conversation
# input_token_count = len(inputs["input_ids"][0])
# out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
# print(out_text)
# return outputs, out_text
# def generate_incremental(inputs):
# # Start with the initial input tokens
# input_ids = inputs["input_ids"]
# generated_tokens = input_ids # Initially, this is just the input tokens
# n = 0
# try:
# # Loop to generate one token at a time
# while True:
# # Call the model with the current tokens
# outputs = model(input_ids=generated_tokens, use_cache=True)
# # Get the next token (the last token from the generated sequence)
# next_token = outputs.logits.argmax(dim=-1)[:, -1]
# # Append the new token to the sequence
# generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
# # Decode and print the newly generated token (skip special tokens)
# out_text = tokenizer.decode(next_token, skip_special_tokens=True)
# print(out_text, end="", flush=True) # Print without newline
# # Check if the generated token is the end-of-sequence token
# if next_token.item() == tokenizer.eos_token_id:
# print("")
# break
# n += 1
# if n >= 15:
# n = 0
# torch.cuda.empty_cache()
# except KeyboardInterrupt:
# pass
# # Once done, return the full generated sequence
# input_token_count = len(inputs["input_ids"][0])
# full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
# torch.cuda.empty_cache()
# return generated_tokens, full_output
def append_generate_chat(input_text: str, role="user"):
    t_start = time.time()
@@ -179,13 +28,6 @@ def append_generate_chat(input_text: str, role="user"):
    if input_text != None:
        messages.append({"role": role, "content": input_text})
# # input_text = "Hello, who are you?"
# # inputs = tokenizer(input_text, return_tensors="pt").to("cpu") # .to("cuda") .to("cpu")
# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True) #continue_final_message=True,
# inputs = {key: value.to(model.device) for key, value in inputs.items()}
# # inputs = {key: value.to("cpu") for key, value in inputs.items()}
# # inputs["input_ids"] = inputs["input_ids"][:, 1:]
    inputs = inference.tokenize(messages, tokenize=True)
    outputs, out_text = inference.generate_incremental(inputs)
@@ -206,16 +48,6 @@ def append_generate_chat(input_text: str, role="user"):
# def generate_tool_use_header(tools: list[callable]) -> str:
# temp_messages = [{}] # for some reason an empty array is not allowed but a {} inside works like an empty array.
# s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
# pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
# match = re.search(pattern, s, re.DOTALL)
# if not match:
# raise Exception("Failed to regex match the template tool system text.")
# extraction = match.group(1)
# return extraction
def main():
    global messages

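The net effect of this cleanup is that the commented-out transformers/bitsandbytes setup above now lives behind the Inference class, and llama.py only wires chat messages to it. As orientation only, not the exact file contents after the commit, here is a minimal sketch of that remaining flow built from the calls visible in this diff; tool-call handling via parse_and_execute_tool_call is omitted:

# Sketch, not the committed file: reconstructed from the context lines in the diff above.
import time
from inference import Inference

messages = []
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
inference = Inference()

def append_generate_chat(input_text: str, role="user"):
    t_start = time.time()
    if input_text is not None:
        messages.append({"role": role, "content": input_text})
    inputs = inference.tokenize(messages, tokenize=True)        # chat template -> token tensor
    outputs, out_text = inference.generate_incremental(inputs)  # stream tokens until EOS
    messages.append({"role": "assistant", "content": out_text})
    print("generation took %.3fs" % (time.time() - t_start))

def main():
    global messages
    messages = [{"role": "system", "content": systemmessage}]
    while True:
        append_generate_chat(input("> "))

if __name__ == "__main__":
    main()
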
tests/helper.py (12 lines changed)

@@ -1,7 +1,19 @@
def tool_dummy(a: int, b: str):
    """
    tool_dummy
    Args:
        a: how much
        b: how text?
    """
    return "result_%d_%s" % (a, b)


def tool_dummy2(text: str):
    """
    tool_dummy2
    Args:
        text: only text?
    """
    return text.upper()

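The docstrings added here follow a Google-style Args: layout, presumably so that tool registration in tool_helper can expose per-parameter descriptions to the model; that mechanism is not part of this diff. The following is a hypothetical sketch of how such an Args: block could be parsed; the function name and approach are illustrative, not the project's actual code:

import inspect

def parse_arg_descriptions(func) -> dict:
    """Collect 'name: description' pairs from a Google-style Args: section."""
    doc = inspect.getdoc(func) or ""
    descriptions = {}
    in_args = False
    for line in doc.splitlines():
        stripped = line.strip()
        if stripped == "Args:":
            in_args = True
            continue
        if in_args:
            if not stripped or ":" not in stripped:
                break  # end of the Args: block
            name, _, desc = stripped.partition(":")
            descriptions[name.strip()] = desc.strip()
    return descriptions

# Example against the helpers above:
# parse_arg_descriptions(tool_dummy) -> {"a": "how much", "b": "how text?"}
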
tests/test_inference.py (122 lines changed)

@@ -0,0 +1,122 @@
import pytest
import tests.helper as helper

inference = None
InferenceClass = None
Tensor = None


def prepare():
    if InferenceClass == None:
        test_import_inference_module_librarys()
    if inference == None:
        test_instantiate_inference_instance()


def test_import_inference_module_librarys():
    import inference
    import torch
    global InferenceClass
    global Tensor
    InferenceClass = inference.Inference
    Tensor = torch.Tensor


def test_instantiate_inference_instance():
    if InferenceClass == None:
        test_import_inference_module_librarys()
    global inference
    inference = InferenceClass()


def test_tool_header_generation():
    prepare()
    tools = [helper.tool_dummy, helper.tool_dummy2]
    header = inference.generate_tool_use_header(tools)
    assert len(header) > 100


def test_tokenize_dummy():
    prepare()
    system_message = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
    user_message = "say 'Hello World!'"
    assistant_message = "Hello World!"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message}
    ]
    history = inference.tokenize(messages, tokenize=False)
    assert type(history) is str
    assert history.find(system_message) != -1
    assert history.find(user_message) != -1
    assert history.find(assistant_message) != -1


def test_tokenize_tensor():
    prepare()
    system_message = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."
    user_message = "say 'Hello World!'"
    assistant_message = "Hello World!"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message}
    ]
    history = inference.tokenize(messages, tokenize=True)
    assert type(history) is Tensor
    assert len(history[0]) >= len(str(messages).split(" "))


def test_inference():
    prepare()
    system_message = "Pretend you are a Python console."
    user_message = "print('Hello World!')"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    input_ids = inference.tokenize(messages, tokenize=True)
    assert type(input_ids) is Tensor
    assert len(input_ids[0]) >= len(str(messages).split(" "))
    generated_tokens, full_output = inference.generate_incremental(input_ids)
    assert type(generated_tokens) is Tensor
    assert len(generated_tokens[0]) > 2
    assert type(full_output) is str
    # assert full_output.find("Hello World!") >= 0


def test_inference_2():
    prepare()
    system_message = "Pretend you are a Python console."
    user_message = "print('Hello World!')"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    input_ids = inference.tokenize(messages, tokenize=True)
    assert type(input_ids) is Tensor
    assert len(input_ids[0]) >= len(str(messages).split(" "))
    generated_tokens, full_output = inference.generate_batch(input_ids)
    assert type(generated_tokens) is Tensor
    assert len(generated_tokens[0]) > 2
    assert type(full_output) is str
    # assert full_output.find("Hello World!") >= 0

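These tests share one expensive model load by having prepare() call the two setup tests directly, which keeps things simple but makes the suite order- and state-dependent. A module-scoped pytest fixture is the usual alternative; a hypothetical sketch, not part of this commit:

import pytest
import tests.helper as helper

@pytest.fixture(scope="module")
def inference():
    # Heavy import and model load happen once per test module.
    from inference import Inference
    return Inference()

def test_tool_header_generation(inference):
    header = inference.generate_tool_use_header([helper.tool_dummy, helper.tool_dummy2])
    assert len(header) > 100

Either way, the inference tests presumably need the model weights available locally, so expect them to be far slower than the decorator tests.
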
tests/test_tool_function_decorator.py (1 line changed)

@@ -5,6 +5,7 @@ import tests.helper as helper
def test_tool_function_decorator():
    """ @tool """
    # get length before adding tools
    start_len = len(tool_helper.tool_list)

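For context, the test above counts tool_helper.tool_list before and after registering functions, which implies the @tool decorator appends the wrapped function to that list. A hypothetical illustration of that behaviour; the decorator's real name, signature, and stored metadata are assumptions, not shown in this diff:

import tool_helper

@tool_helper.tool  # assumed spelling; the diff only refers to it as "@tool"
def local_dummy(x: int):
    """
    local_dummy
    Args:
        x: a number
    """
    return x * 2

# If the assumption holds, the registry now contains the new entry.
assert len(tool_helper.tool_list) >= 1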