llama/llama.py


								import time

								import random

								from tool_helper import tool_list, parse_and_execute_tool_call

								from tool_functions import register_dummy

								from inference import Inference, torch_reseed

								import datetime

								from modelconfig import Modelconfig


								# --- conversation state ---
								# Chat history: a list of {"role": ..., "content": ...} dicts.
								# main() replaces it with a list holding the initial system message.
								messages = []
								# Inference backend; stays None until main() creates it.
								inference = None

								# --- prompt texts ---
								# System message placed at the very beginning of the chat. main() appends the
								# current date/time and, when append_toolcalls is set, the tool usage header.
								# Alternative wording that additionally caps the reply length:
								#   "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."
								systemmessage = (
								    "Hold a casual conversation with the user. "
								    "Answer using markdown to the user."
								)

								# System message used for the role flip, so the model writes the next
								# message on behalf of the user.
								roleflip = dict(
								    role="system",
								    content=(
								        "Keep the conversation going, ask for more information on the subject. "
								        "Keep messages short at max 1-2 sentences. "
								        "Do not thank and say goodbye."
								    ),
								)

								# Leading system message and trailing request that make the model
								# summarize the entire conversation.
								summarize = dict(
								    role="system",
								    content=(
								        "Summarize the conversation as a single, cohesive paragraph. "
								        "Avoid using any bullet points, numbers, or list formatting. "
								        "Write in plain text with natural sentences that flow together seamlessly."
								    ),
								)
								# NOTE(review): despite the name this request is sent with the "system" role.
								summarize_user = dict(role="system", content="Can you summarize the conversation?")

								# System message asking the model for a conversation title.
								title_prompt = dict(
								    role="system",
								    content=(
								        "Please create a very short and descriptive title or label for this conversation. "
								        "Maximum 2-5 words. "
								        "Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."
								    ),
								)

								# When True, main() adds the tool usage header to the system message.
								append_toolcalls = False

								# Runs at import time; presumably registers a placeholder tool - confirm in tool_functions.
								register_dummy()


								def append_generate_chat(input_text: str, role="user"):
								    """Append a message to the chat history and generate the assistant reply.

								    The reply is appended to the global ``messages`` list with the role
								    "assistant" and a timing line is printed. If
								    ``parse_and_execute_tool_call`` returns a result for the reply, that
								    result is wrapped in ``<tool_response>`` tags, appended with the role
								    "tool" and another reply is generated. This repeats until a reply
								    yields no tool result.

								    Args:
								        input_text: Content of the message to append before generating.
								            ``None`` appends nothing and generates from the history as is.
								        role: Role stored with ``input_text`` (defaults to "user").
								    """
								    pending_text, pending_role = input_text, role

								    # A loop instead of recursion: a long chain of tool calls must not grow the call stack.
								    while True:
								        t_start = time.time()

								        if pending_text is not None:
								            messages.append({"role": pending_role, "content": pending_text})

								        # generate AI response
								        inputs = inference.tokenize(messages, tokenize=True)
								        number_of_input_tokens = inputs.shape[1]
								        outputs, out_text = inference.generate(inputs)

								        # append result to message history
								        messages.append({"role": "assistant", "content": out_text})

								        print("")
								        time_taken = time.time() - t_start
								        number_of_tokens = len(outputs[0])
								        # The prompt length is subtracted for the speed; presumably outputs[0] contains prompt + reply.
								        generated_tokens = number_of_tokens - number_of_input_tokens
								        # Guard against a zero reading from a coarse clock.
								        tokens_per_second = generated_tokens / time_taken if time_taken > 0 else 0.0
								        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

								        # handle tool call and check if a tool call has happened.
								        tool_result = parse_and_execute_tool_call(out_text, tool_list)
								        if tool_result is None:
								            return

								        # tool call happened: feed the result back as a "tool" message.
								        # depending on the chat template the tool response tags must or must not be passed. :(
								        pending_text = "<tool_response>%s</tool_response>" % tool_result
								        pending_role = "tool"


								def terminal_generation_loop():
								    """Run the interactive terminal chat.

								    Reads one line per iteration from stdin. Lines starting with "!" are sent
								    as a tool response, lines starting with "/" are commands (see "/help"),
								    everything else is sent as a user message. The global ``messages`` list
								    is rebound by several commands.

								    Returns when stdin reaches end-of-file; otherwise loops forever.
								    """
								    global messages

								    while True:
								        # print an input prompt to receive text or commands
								        try:
								            input_text = input(">>> ")
								        except EOFError:
								            # stdin was closed (Ctrl-D or end of piped input): leave quietly instead of a traceback
								            print("")
								            return
								        print("")

								        if input_text.startswith("!"):
								            # depending on the chat template the tool response tags must or must not be passed. :(
								            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")

								        elif input_text.startswith("/clear"):
								            print("clearing chat history")
								            # keep only the first message (the system message set up by main())
								            messages = [messages[0]]
								            print("")

								        elif input_text.startswith("/history"):
								            print(inference.tokenize(messages, tokenize=False))

								        elif input_text.startswith("/undo"):
								            if len(messages) > 2:
								                print("undo latest prompt")
								                # drop the last two messages
								                messages = messages[:-2]
								            else:
								                print("cannot undo because there are not enough messages on history.")
								            print("")

								        elif input_text.startswith("/regen"):
								            if len(messages) >= 2:
								                print("regenerating message (not working)")
								                messages = messages[:-1]
								                # reseed with a fresh random seed before generating again
								                torch_reseed(random.randint(0, 2**32 - 1))
								                append_generate_chat(None)
								            else:
								                print("cannot regenerate because there are not enough messages on history.")
								            print("")

								        elif input_text.startswith("/more"):
								            append_generate_chat(None)

								        elif input_text.startswith("/file"):
								            filename = input_text[len("/file "):].strip()
								            print("read '%s' for prompt:" % filename)
								            try:
								                with open(filename, "r") as f:
								                    content = f.read()
								            except (OSError, UnicodeDecodeError) as err:
								                # a missing or unreadable file must not end the whole chat session
								                print("cannot read '%s': %s" % (filename, err))
								                print("")
								                continue
								            print(content)
								            append_generate_chat(content)

								        elif input_text.startswith("/auto"):
								            messages_backup = messages
								            # Temporary history: roleflip prompt first, then every non-system
								            # message with the user/assistant roles swapped.
								            messages = [roleflip]
								            for m in messages_backup:
								                role = m["role"]
								                content = m["content"]
								                if role == "user":
								                    role = "assistant"
								                elif role == "assistant":
								                    role = "user"
								                if role != "system":
								                    messages.append({"role": role, "content": content})
								            append_generate_chat(None)  # will automatically advance the conversation as 'user'

								            # Move the generated message into the real history as a user message.
								            last_message = messages[-1]
								            last_message["role"] = "user"
								            messages = messages_backup + [last_message]
								            append_generate_chat(None)  # 'regular' chatbot answer

								        elif input_text.startswith("/summarize"):
								            messages_temp = [m for m in messages if m["role"] != "system"]
								            messages_temp = [summarize] + messages_temp + [summarize_user]
								            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
								            # The result is not stored; presumably inference.generate shows the text itself - confirm.
								            inference.generate(input_ids)

								        elif input_text.startswith("/title"):
								            messages_temp = [m for m in messages if m["role"] != "system"]
								            messages_temp = [title_prompt] + messages_temp
								            # Change the role on a copy: the dict is shared with the chat history
								            # (or is title_prompt itself when the history has no other messages).
								            messages_temp[-1] = dict(messages_temp[-1], role="user")
								            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
								            # The result is not stored; presumably inference.generate shows the text itself - confirm.
								            inference.generate(input_ids)

								        elif input_text.startswith("/help"):
								            print("!<prompt>   answer as 'tool' in <tool_response> tags")
								            print("/clear      clear chat history")
								            print("/undo       undo latest prompt")
								            print("/regen      regenerate the last message")
								            print("/more       generate more additional information")
								            print("/file       read prompt input from file")
								            print("/auto       automatically advance conversation")
								            print("/summarize  generate a summary of the chat")
								            print("/title      generate a title of the chat")
								            print("/help       print this message")
								            print("")

								        elif input_text.startswith("/"):
								            print("unknown command.")

								        else:
								            append_generate_chat(input_text)


								def main():
								    """Create the inference backend, seed the chat history and start the chat loop.

								    Sets the globals ``inference`` and ``messages``. The initial system
								    message consists of ``systemmessage``, the current date and time and,
								    when ``append_toolcalls`` is set, the tool usage header produced by
								    ``inference.generate_tool_use_header``. The parts are joined by newlines.
								    """
								    global messages
								    global inference

								    # Notes on the tried models; exactly one Modelconfig line is active.

								    # model: NousResearch/Hermes-3-Llama-3.2-3B
								    # tokens: 315 tk
								    # time: 94.360 s
								    # speed: 3.338 tk/s
								    # vram_bulk: 3622 MB
								    # vram_top: 80 MB
								    # context: 131072 tk
								    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

								    # model: unsloth/Llama-3.2-1B
								    # tokens: 589 tk
								    # time: 39.348 s
								    # speed: 14.969 tk/s
								    # vram_bulk: 4708 MB
								    # vram_top: 102 MB
								    # context: 131072 tk
								    # model = Modelconfig("unsloth/Llama-3.2-1B")  # note, fast, but talks to itself. basically does not work.

								    # model: unsloth/Llama-3.2-3B-Instruct
								    # tokens: 285 tk
								    # time: 75.363 s
								    # speed: 3.782 tk/s
								    # vram_bulk: 3512 MB
								    # vram_top: 48 MB
								    # context: 131072 tk
								    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

								    # model: unsloth/llama-3-8b-bnb-4bit
								    # tokens: 435 tk
								    # time: 84.314 s
								    # speed: 5.159 tk/s
								    # vram_bulk: 5440 MB
								    # vram_top: 216 MB
								    # context: 8192 tk
								    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

								    # Model size: 3.21B params
								    # vram used: xxxxx MB
								    # speed xxxxx t/s
								    # working: DOES NOT LOAD
								    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

								    # model: unsloth/gemma-2-9b-it-bnb-4bit
								    # tokens: 154 tk
								    # time: 32.727 s
								    # speed: 4.706 tk/s
								    # vram_bulk: 6156 MB
								    # vram_top: 232 MB
								    # context: 8192 tk
								    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

								    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
								    # tokens: 120 tk
								    # time: 12.248 s
								    # speed: 9.798 tk/s
								    # vram_bulk: 5382 MB
								    # vram_top: 170 MB
								    # context: 32768 tk
								    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")  # note, this works really good

								    # model: unsloth/Qwen2.5-3B-Instruct
								    # tokens: 112 tk
								    # time: 12.703 s
								    # speed: 8.816 tk/s
								    # vram_bulk: 2108 MB
								    # vram_top: 98 MB
								    # context: 32768 tk
								    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

								    # model: unsloth/Qwen2.5-3B-Instruct
								    # tokens: 118 tk
								    # time: 33.748 s
								    # speed: 3.497 tk/s
								    # vram_bulk: 3310 MB
								    # vram_top: 60 MB
								    # context: 32768 tk
								    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

								    # Model size: 3.87B params
								    # vram used: xxxxx MB
								    # speed xxxxx t/s
								    # error: requires the protobuf library but it was not found in your environment
								    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

								    inference = Inference(model)

								    # %H is the 24-hour clock, so no AM/PM marker (%p) is added; it would give e.g. "21:05 PM".
								    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M right now.")

								    system_parts = [systemmessage, current_date_and_time]
								    if append_toolcalls:
								        system_parts.append(inference.generate_tool_use_header(tool_list))
								    messages = [{"role": "system", "content": "\n".join(system_parts)}]

								    terminal_generation_loop()


								# Start the interactive chat only when this file is executed as a script,
								# not when it is imported as a module.
								if __name__ == "__main__":

								    main()