Tool calls with Hermes-3
llama.py (45 lines changed)
@@ -14,7 +14,7 @@ model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftob
 # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model


-quantization_config_4bit = BitsAndBytesConfig(
+quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4", # Recommended for better performance
     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
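The new comment notes that tool calls don't really work in 4-bit mode; the next hunk accordingly switches the model load to a quantization_config_8bit, which is not shown in this diff. A minimal sketch of what that config could look like with bitsandbytes (the variable name matches the later hunk, but this definition is an assumption, not a copy from the file):

from transformers import BitsAndBytesConfig

# Assumed 8-bit counterpart of the 4-bit config above; load_in_8bit is the
# standard bitsandbytes switch for int8 weight loading.
quantization_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)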
@@ -30,7 +30,7 @@ model = AutoModelForCausalLM.from_pretrained(
     # device_map="cuda", # Automatically places parts of the model on GPU/CPU
     device_map="cuda", # Automatically places parts of the model on GPU/CPU
     # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_4bit
+    quantization_config=quantization_config_8bit
 )

 # Load tokenizer
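In context, the from_pretrained call this hunk touches would then look roughly as follows; a sketch assuming the Hermes-3 model name from the hunk header and the 8-bit config sketched above, not a verbatim copy of the file:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)  # assumed, see sketch above

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",                             # place the model on the GPU
    quantization_config=quantization_config_8bit,  # 8-bit instead of 4-bit for tool calls
)
tokenizer = AutoTokenizer.from_pretrained(model_name)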
@@ -52,23 +52,23 @@ print("max_context_length is %d tokens." % (max_context_length))

 # Generate text

-schema = """
-{
-    "properties": {
-        "program": {
-            "description": "Python program to be executed. The Message response to the input query is the output of this program",
-            "title": "Program",
-            "type": "string"
-        },
+# schema = """
+# {
+#     "properties": {
+#         "program": {
+#             "description": "Python program to be executed. The Message response to the input query is the output of this program",
+#             "title": "Program",
+#             "type": "string"
+#         },

-    },
-    "required": [
-        "program"
-    ],
-    "title": "Response",
-    "type": "object"
-}
-"""
+#     },
+#     "required": [
+#         "program"
+#     ],
+#     "title": "Response",
+#     "type": "object"
+# }
+# """


 # """
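The schema being commented out here described the structured reply the model was asked to produce: a JSON object whose single required field "program" holds a Python program as a string. For reference, this is how such a reply could be parsed and checked against that schema (a sketch using the third-party jsonschema package; the example reply is made up):

import json
import jsonschema

response_schema = {
    "properties": {
        "program": {
            "description": "Python program to be executed. The Message response to the input query is the output of this program",
            "title": "Program",
            "type": "string",
        },
    },
    "required": ["program"],
    "title": "Response",
    "type": "object",
}

reply = '{"program": "print(2 + 2)"}'  # hypothetical model output

parsed = json.loads(reply)
jsonschema.validate(instance=parsed, schema=response_schema)  # raises ValidationError on a bad shape
print(parsed["program"])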
@@ -79,10 +79,10 @@ schema = """
 # }
 # """

-tool_assist = """
-You are a python assisted AI model. You may call the interpreter one or more times to assist with the user query. You might tell the user that you obtained ground truth with the help of python or a calculator if asked. The user is not able to see if python has been used, therefore do not expose or share failed attempts or syntax errors.
-To invoke this function, the answer may start and end with <python_tool_call> and </python_tool_call> respectively. The rest must be a valid python script; additional text is not allowed before or after. Calling python is not needed when just providing example code.
-"""
+# tool_assist = """
+# You are a python assisted AI model. You may call the interpreter one or more times to assist with the user query. You might tell the user that you obtained ground truth with the help of python or a calculator if asked. The user is not able to see if python has been used, therefore do not expose or share failed attempts or syntax errors.
+# To invoke this function, the answer may start and end with <python_tool_call> and </python_tool_call> respectively. The rest must be a valid python script; additional text is not allowed before or after. Calling python is not needed when just providing example code.
+# """

 messages = [
     # {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences."},
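The system prompt being commented out asks the model to wrap an answer entirely in <python_tool_call> and </python_tool_call> when it wants the interpreter run. A sketch of how the calling code could detect and extract such an answer before executing it (the helper name and example reply are illustrative, not taken from the repository):

import re

def extract_python_tool_call(reply: str) -> str | None:
    # The prompt demands that the whole answer starts and ends with the tags,
    # so only a match spanning the entire (stripped) reply counts.
    match = re.fullmatch(
        r"<python_tool_call>\s*(.*?)\s*</python_tool_call>",
        reply.strip(),
        flags=re.DOTALL,
    )
    return match.group(1) if match else None

reply = "<python_tool_call>\nprint(sum(range(10)))\n</python_tool_call>"  # hypothetical output
script = extract_python_tool_call(reply)
if script is not None:
    print("interpreter requested:")
    print(script)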
@@ -187,6 +187,7 @@ def append_generate_chat(input_text: str, role="user"):
     inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True, tools=tool_functions) #continue_final_message=True,
     inputs = {key: value.to(model.device) for key, value in inputs.items()}
     # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # inputs["input_ids"] = inputs["input_ids"][:, 1:]

     with torch.inference_mode():
         outputs, out_text = generate_incremental(inputs)
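apply_chat_template is called with tools=tool_functions, so the tokenizer renders the tool signatures into Hermes-3's tool-calling prompt format. The list itself is defined elsewhere in the file; a sketch of how such a list is typically built for Transformers chat templates (the run_python example function and its docstring are assumptions), reusing the tokenizer and messages from earlier in the script:

def run_python(program: str) -> str:
    """
    Execute a Python program and return its printed output.

    Args:
        program: Python source code to execute.
    """
    # Illustrative only; a real implementation would sandbox the code.
    import io, contextlib
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        exec(program)
    return buffer.getvalue()

# Transformers derives a JSON tool schema from the signature and docstring
# and injects it into the chat template when tools=... is passed.
tool_functions = [run_python]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
    tools=tool_functions,
)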