commit f46ac3a4c1
3 changed files with 43 additions and 0 deletions
@@ -0,0 +1,2 @@
/model/*
*.prof
@@ -0,0 +1,38 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

t_start = time.time()

# model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
# "meta-llama/Llama-2-7b-hf" # Replace with your chosen model

# Load the model with quantization (optional)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # device_map="auto", # Automatically places parts of the model on GPU/CPU
    device_map="cpu", # Keep the whole model on the CPU
    load_in_8bit=False, # Enables 8-bit quantization if bitsandbytes is installed
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("load took %.3fs" % (time.time() - t_start))
t_start = time.time()

# Generate text
input_text = "Hello, who are you?"
inputs = tokenizer(input_text, return_tensors="pt").to("cpu") # .to("cuda") .to("cpu")
outputs = model.generate(
    inputs["input_ids"],
    # max_length=200,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

print("generation took %.3fs" % (time.time() - t_start))
t_start = time.time()
@@ -0,0 +1,3 @@
transformers
accelerate
bitsandbytes
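
Note: the committed script keeps load_in_8bit=False even though bitsandbytes is listed in the requirements. Below is a minimal sketch, not part of the commit, of how the 8-bit load could be enabled through transformers' BitsAndBytesConfig. It assumes a CUDA-capable GPU is available (bitsandbytes does not quantize on pure CPU) and reuses the same model name; the generation limit is an arbitrary choice for illustration.

# Sketch only: 8-bit quantized load via BitsAndBytesConfig (assumes a CUDA GPU).
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                 # place layers on GPU/CPU automatically
    quantization_config=quant_config,  # replaces the plain load_in_8bit flag
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(inputs["input_ids"], max_new_tokens=100)  # assumed limit
print(tokenizer.decode(outputs[0], skip_special_tokens=True))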