from transformers import AutoModelForCausalLM, AutoTokenizer
import time

t_start = time.time()

# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
# "meta-llama/Llama-2-7b-hf"  # replace with your chosen model

# Load the model (8-bit quantization optional)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # device_map="auto",  # automatically places parts of the model on GPU/CPU
    device_map="cpu",  # force the whole model onto the CPU
    load_in_8bit=False,  # enables 8-bit quantization if bitsandbytes is installed
)
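
# Newer transformers releases deprecate the bare load_in_8bit flag in favor of
# BitsAndBytesConfig; a minimal sketch of that variant (an assumption, not part
# of the original script; needs bitsandbytes and a CUDA GPU):
# from transformers import BitsAndBytesConfig
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
# )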

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
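
# Llama-family tokenizers often ship without a pad token, so
# tokenizer.pad_token_id can be None at this point; reusing the EOS token is a
# common workaround (this guard is an addition, not part of the original script)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token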

print("load took %.3fs" % (time.time() - t_start))
t_start = time.time()

# Generate text
input_text = "Hello, who are you?"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # use .to("cuda") on a GPU
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # passing the mask silences a generate() warning
    # max_length=200,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
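
# Without max_length or max_new_tokens, generate() falls back to a short
# default budget (20 tokens in the stock generation config), so replies get
# truncated; a hedged alternative, with illustrative values not taken from the
# original script:
# outputs = model.generate(inputs["input_ids"], max_new_tokens=200, do_sample=True, temperature=0.7)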

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
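
# Hermes-3 is an instruct-tuned model, so its chat template usually yields
# better replies than a raw string prompt; a minimal sketch, assuming the
# checkpoint ships a ChatML-style chat template (not part of the original script):
# messages = [{"role": "user", "content": input_text}]
# chat_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# outputs = model.generate(chat_ids, max_new_tokens=200)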

print("generation took %.3fs" % (time.time() - t_start))
t_start = time.time()