
first tests

Florin Tobler committed f46ac3a4c1 to master, 6 months ago
  1. .gitignore (+2)
  2. llama.py (+38)
  3. requirements.txt (+3)

.gitignore (+2)

@@ -0,0 +1,2 @@
/model/*
*.prof

llama.py (+38)

@@ -0,0 +1,38 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
t_start = time.time()
# model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
# "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
# Load the model with quantization (optional)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # device_map="auto",  # automatically place parts of the model on GPU/CPU
    device_map="cpu",      # force CPU execution for this test
    load_in_8bit=False,    # set True for 8-bit quantization (requires bitsandbytes)
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("load took %.3fs" % (time.time() - t_start))
t_start = time.time()
# Generate text
input_text = "Hello, who are you?"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu") # .to("cuda") .to("cpu")
outputs = model.generate(
    inputs["input_ids"],
    # max_length=200,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
print("genaration took %.3fs" % (time.time() - t_start))
t_start = time.time()
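The commented-out device_map="auto" line and the bitsandbytes entry in requirements.txt hint at a GPU / 8-bit path that this commit does not actually exercise. A minimal sketch of that variant, assuming a CUDA-capable machine with accelerate and bitsandbytes installed (BitsAndBytesConfig is the transformers API for quantized loading), could look like:

# Hypothetical GPU / 8-bit variant; not part of this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # spread layers across GPU/CPU automatically (needs accelerate)
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights via bitsandbytes
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(inputs["input_ids"], max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))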

requirements.txt (+3)

@@ -0,0 +1,3 @@
transformers
accelerate
bitsandbytes
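The three packages above are presumably installed with "pip install -r requirements.txt"; accelerate is what enables device_map-based placement in from_pretrained, and bitsandbytes is only needed if the 8-bit quantization path is enabled.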