from transformers import AutoModelForCausalLM, AutoTokenizer
import time

t_start = time.time()

# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model

# Load the model (quantization optional)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
    device_map="cpu",     # Run entirely on the CPU
    load_in_8bit=False,   # Set True for 8-bit quantization (requires bitsandbytes)
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("load took %.3fs" % (time.time() - t_start))
t_start = time.time()

# Generate text
input_text = "Hello, who are you?"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # use .to("cuda") when running on GPU

outputs = model.generate(
    inputs["input_ids"],
    # max_length=200,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

print("generation took %.3fs" % (time.time() - t_start))
t_start = time.time()
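
# Optional: a minimal sketch of chat-style prompting. Hermes-3 is an instruct/chat
# model, so wrapping the prompt with the tokenizer's built-in chat template (via
# apply_chat_template) usually yields a proper assistant reply rather than a raw
# text continuation. The message list and max_new_tokens value below are
# illustrative assumptions, not part of the original script.
messages = [{"role": "user", "content": "Hello, who are you?"}]
chat_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant turn header so the model answers
    return_tensors="pt",
).to("cpu")
chat_outputs = model.generate(
    chat_ids,
    max_new_tokens=200,  # bound the length of the reply
    eos_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(chat_outputs[0], skip_special_tokens=False))
print("chat generation took %.3fs" % (time.time() - t_start))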