Tool calls with Hermes-3
llama.py (45 lines changed)
@@ -14,7 +14,7 @@ model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftob
 # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model


-quantization_config_4bit = BitsAndBytesConfig(
+quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4", # Recommended for better performance
     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
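The new comment notes that tool calls don't really work in 4-bit mode; the next hunk accordingly switches the model load to a quantization_config_8bit, which is not shown in this diff. A minimal sketch of what that config could look like with bitsandbytes (the variable name matches the later hunk, but this definition is an assumption, not a copy from the file):

from transformers import BitsAndBytesConfig

# Assumed 8-bit counterpart of the 4-bit config above; load_in_8bit is the
# standard bitsandbytes switch for int8 weight loading.
quantization_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)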
@@ -30,7 +30,7 @@ model = AutoModelForCausalLM.from_pretrained(
     # device_map="cuda", # Automatically places parts of the model on GPU/CPU
     device_map="cuda", # Automatically places parts of the model on GPU/CPU
     # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_4bit
+    quantization_config=quantization_config_8bit
 )

 # Load tokenizer
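In context, the from_pretrained call this hunk touches would then look roughly as follows; a sketch assuming the Hermes-3 model name from the hunk header and the 8-bit config sketched above, not a verbatim copy of the file:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)  # assumed, see sketch above

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",                             # place the model on the GPU
    quantization_config=quantization_config_8bit,  # 8-bit instead of 4-bit for tool calls
)
tokenizer = AutoTokenizer.from_pretrained(model_name)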
@@ -52,23 +52,23 @@ print("max_context_length is %d tokens." % (max_context_length))

 # Generate text

-schema = """
-{
-    "properties": {
-        "program": {
-            "description": "Python program to be executed. The Message response to the input query is the output of this program",
-            "title": "Program",
-            "type": "string"
-        },
+# schema = """
+# {
+#     "properties": {
+#         "program": {
+#             "description": "Python program to be executed. The Message response to the input query is the output of this program",
+#             "title": "Program",
+#             "type": "string"
+#         },

-    },
-    "required": [
-        "program"
-    ],
-    "title": "Response",
-    "type": "object"
-}
-"""
+#     },
+#     "required": [
+#         "program"
+#     ],
+#     "title": "Response",
+#     "type": "object"
+# }
+# """


 # """
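The schema being commented out here described the structured reply the model was asked to produce: a JSON object whose single required field "program" holds a Python program as a string. For reference, this is how such a reply could be parsed and checked against that schema (a sketch using the third-party jsonschema package; the example reply is made up):

import json
import jsonschema

response_schema = {
    "properties": {
        "program": {
            "description": "Python program to be executed. The Message response to the input query is the output of this program",
            "title": "Program",
            "type": "string",
        },
    },
    "required": ["program"],
    "title": "Response",
    "type": "object",
}

reply = '{"program": "print(2 + 2)"}'  # hypothetical model output

parsed = json.loads(reply)
jsonschema.validate(instance=parsed, schema=response_schema)  # raises ValidationError on a bad shape
print(parsed["program"])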
@@ -79,10 +79,10 @@ schema = """
 # }
 # """

-tool_assist = """
-You are a python assisted AI model. You may call the interpreter one or more times to assist with the user query. You might tell the user that you obtained ground truth with the help of python or a calculator if asked. The user is not able to see if python has been used, therefore do not expose or share failed attempts or syntax errors.
-To invoke this function, the answer may start and end with <python_tool_call> and </python_tool_call> respectively. The rest must be a valid python script; additional text is not allowed before or after. Calling python is not needed when just providing example code.
-"""
+# tool_assist = """
+# You are a python assisted AI model. You may call the interpreter one or more times to assist with the user query. You might tell the user that you obtained ground truth with the help of python or a calculator if asked. The user is not able to see if python has been used, therefore do not expose or share failed attempts or syntax errors.
+# To invoke this function, the answer may start and end with <python_tool_call> and </python_tool_call> respectively. The rest must be a valid python script; additional text is not allowed before or after. Calling python is not needed when just providing example code.
+# """

 messages = [
     # {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences."},
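The system prompt being commented out asks the model to wrap an answer entirely in <python_tool_call> and </python_tool_call> when it wants the interpreter run. A sketch of how the calling code could detect and extract such an answer before executing it (the helper name and example reply are illustrative, not taken from the repository):

import re

def extract_python_tool_call(reply: str) -> str | None:
    # The prompt demands that the whole answer starts and ends with the tags,
    # so only a match spanning the entire (stripped) reply counts.
    match = re.fullmatch(
        r"<python_tool_call>\s*(.*?)\s*</python_tool_call>",
        reply.strip(),
        flags=re.DOTALL,
    )
    return match.group(1) if match else None

reply = "<python_tool_call>\nprint(sum(range(10)))\n</python_tool_call>"  # hypothetical output
script = extract_python_tool_call(reply)
if script is not None:
    print("interpreter requested:")
    print(script)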
@@ -187,6 +187,7 @@ def append_generate_chat(input_text: str, role="user"):
     inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True, tools=tool_functions) #continue_final_message=True,
     inputs = {key: value.to(model.device) for key, value in inputs.items()}
     # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # inputs["input_ids"] = inputs["input_ids"][:, 1:]

     with torch.inference_mode():
         outputs, out_text = generate_incremental(inputs)
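apply_chat_template is called with tools=tool_functions, so the tokenizer renders the tool signatures into Hermes-3's tool-calling prompt format. The list itself is defined elsewhere in the file; a sketch of how such a list is typically built for Transformers chat templates (the run_python example function and its docstring are assumptions), reusing the tokenizer and messages from earlier in the script:

def run_python(program: str) -> str:
    """
    Execute a Python program and return its printed output.

    Args:
        program: Python source code to execute.
    """
    # Illustrative only; a real implementation would sandbox the code.
    import io, contextlib
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        exec(program)
    return buffer.getvalue()

# Transformers derives a JSON tool schema from the signature and docstring
# and injects it into the chat template when tools=... is passed.
tool_functions = [run_python]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
    tools=tool_functions,
)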