5 changed files with 288 additions and 47 deletions
@@ -0,0 +1,37 @@
from inference import Inference
from modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()
@@ -0,0 +1,76 @@
from inference import Inference
from modelconfig import Modelconfig
import time
import nvidia_smi
import torch
import gc


def empty_cuda():
    # Repeatedly collect garbage and flush the CUDA cache until reported VRAM drops below 200 MB,
    # so each model is profiled from a clean baseline.
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()
    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)

    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]  # VRAM taken by loading the model weights
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]  # additional VRAM used during generation (activations, KV cache)
    print("model: %s" % model_conf.model_name)
    print("tokens: %d tk" % tokens)
    print("time: %.3f s" % took)
    print("speed: %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top: %d MB" % vram_top)
    print("context: %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    # Log failures instead of raising so the remaining models are still profiled.
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))


if __name__ == "__main__":
    main()
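
The nvidia_smi module imported by the profiler is not shown in this excerpt. A minimal sketch of a compatible get_gpu_stats() helper, assuming the pynvml bindings and a single GPU at index 0 (only the function name and the "memory_used" key are taken from the calls above; everything else is an assumption):

import pynvml


def get_gpu_stats(device_index=0):
    # Assumed helper: report memory in MB so it matches the "%d MB" prints above.
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return {
            "memory_used": mem.used // (1024 * 1024),
            "memory_total": mem.total // (1024 * 1024),
            "gpu_utilization": util.gpu,
        }
    finally:
        pynvml.nvmlShutdown()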
@@ -0,0 +1,20 @@
from transformers import BitsAndBytesConfig
import torch


class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config is None
            self.bits_and_bytes_config = BitsAndBytesConfig(  # tool calls don't really work in 4-bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",  # recommended for better performance
                bnb_4bit_use_double_quant=True,  # optional: further quantization for more memory savings
                bnb_4bit_compute_dtype=torch.bfloat16  # use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config is None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config
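
The Inference class that consumes this config is not part of the excerpt. As a rough sketch of how the fields would typically be passed to the Hugging Face transformers API (load_model below is hypothetical, not the repository's actual loader):

from transformers import AutoModelForCausalLM, AutoTokenizer

from modelconfig import Modelconfig


def load_model(config: Modelconfig):
    # Hypothetical loader: hands the prepared BitsAndBytesConfig (or None for full precision)
    # straight to transformers when fetching the weights.
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        quantization_config=config.bits_and_bytes_config,
        device_map="auto",  # place weights on the available GPU(s)
    )
    return model, tokenizer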