commit f46ac3a4c1
3 changed files with 43 additions and 0 deletions
@@ -0,0 +1,2 @@
/model/*
*.prof
@@ -0,0 +1,38 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

t_start = time.time()

# model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
# "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model

# Load the model with quantization (optional)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # device_map="auto",  # Automatically places parts of the model on GPU/CPU
    device_map="cpu",     # Forces the entire model onto the CPU
    load_in_8bit=False,   # Enables 8-bit quantization if bitsandbytes is installed
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("load took %.3fs" % (time.time() - t_start))
t_start = time.time()

# Generate text
input_text = "Hello, who are you?"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # .to("cuda") .to("cpu")
outputs = model.generate(
    inputs["input_ids"],
    # max_length=200,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

print("generation took %.3fs" % (time.time() - t_start))
t_start = time.time()
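
The script keeps load_in_8bit=False and runs entirely on the CPU. As a minimal sketch only (not part of this commit), 8-bit loading could instead be requested through BitsAndBytesConfig, assuming a recent transformers release, the bitsandbytes package from the requirements below, and a CUDA-capable GPU:

# Sketch: 8-bit quantized loading (assumes bitsandbytes is installed and a CUDA GPU is available)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(load_in_8bit=True)   # quantize linear layers to int8 at load time
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Hermes-3-Llama-3.2-3B",
    device_map="auto",                 # let accelerate place layers on GPU/CPU
    quantization_config=quant_config,  # replaces the deprecated load_in_8bit kwarg
)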
@@ -0,0 +1,3 @@
transformers
accelerate
bitsandbytes
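
The dependencies above can be installed in one step; the command below assumes the list is saved as requirements.txt, a filename the commit page does not show:

pip install -r requirements.txt   # hypothetical filename for the dependency list above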