commit f46ac3a4c1a6a1fe06b23fbf31764f5e72bb82e4
Author: Florin Tobler
Date:   Tue Dec 31 16:23:42 2024 +0100

    first tests

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cb1f61f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/model/*
+*.prof
\ No newline at end of file
diff --git a/llama.py b/llama.py
new file mode 100644
index 0000000..a3ae422
--- /dev/null
+++ b/llama.py
@@ -0,0 +1,38 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import time
+
+t_start = time.time()
+
+# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
+
+# Load the model with quantization (optional)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
+    device_map="cpu",  # Keep the whole model on the CPU
+    load_in_8bit=False,  # Enables 8-bit quantization if bitsandbytes is installed
+)
+
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+print("load took %.3fs" % (time.time() - t_start))
+t_start = time.time()
+
+# Generate text
+input_text = "Hello, who are you?"
+inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
+outputs = model.generate(
+    inputs["input_ids"],
+    # max_length=200,
+    pad_token_id=tokenizer.pad_token_id,
+    eos_token_id=tokenizer.eos_token_id
+)
+
+# Decode and print result
+print(tokenizer.decode(outputs[0], skip_special_tokens=False))
+
+print("generation took %.3fs" % (time.time() - t_start))
+t_start = time.time()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d301274
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+transformers
+accelerate
+bitsandbytes
\ No newline at end of file
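
Note (not part of the commit): requirements.txt pulls in bitsandbytes and the script leaves load_in_8bit=False, so a natural next step would be the quantized load the comments hint at. The following is a minimal sketch, not code from this repository; it assumes a CUDA-capable GPU (bitsandbytes does not quantize on CPU) and uses the BitsAndBytesConfig / quantization_config interface from the transformers bitsandbytes integration.

# Hypothetical 8-bit variant of llama.py's model load; not part of this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"

quant_config = BitsAndBytesConfig(load_in_8bit=True)  # 8-bit weights via bitsandbytes
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                 # place layers on the available GPU(s)
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(
    inputs["input_ids"],
    max_new_tokens=100,                # bound generation length instead of max_length
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))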