llama/chatbug/model_selection.py



								from chatbug.modelconfig import Modelconfig


								def get_model() -> Modelconfig:
								    """Return the Modelconfig of the model currently selected for use.

								    The active selection is "unsloth/Qwen2.5-7B-Instruct-bnb-4bit".
								    The comment catalogue below keeps the figures and remarks noted
								    for each candidate that was tried, together with the Modelconfig
								    arguments that were used for it.
								    """
								    # Candidate catalogue (Modelconfig arguments, then recorded notes).
								    #
								    # Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)
								    #     tokens: 315 tk | time: 94.360 s | speed: 3.338 tk/s
								    #     vram_bulk: 3622 MB | vram_top: 80 MB | context: 131072 tk
								    #
								    # Modelconfig("unsloth/Llama-3.2-1B")
								    #     tokens: 589 tk | time: 39.348 s | speed: 14.969 tk/s
								    #     vram_bulk: 4708 MB | vram_top: 102 MB | context: 131072 tk
								    #     note: fast, but talks to itself. basically does not work.
								    #
								    # Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)
								    #     tokens: 285 tk | time: 75.363 s | speed: 3.782 tk/s
								    #     vram_bulk: 3512 MB | vram_top: 48 MB | context: 131072 tk
								    #
								    # Modelconfig("unsloth/llama-3-8b-bnb-4bit")
								    #     tokens: 435 tk | time: 84.314 s | speed: 5.159 tk/s
								    #     vram_bulk: 5440 MB | vram_top: 216 MB | context: 8192 tk
								    #
								    # Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)
								    #     model size: 3.21B params | vram and speed not recorded
								    #     working: DOES NOT LOAD
								    #
								    # Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")
								    #     tokens: 154 tk | time: 32.727 s | speed: 4.706 tk/s
								    #     vram_bulk: 6156 MB | vram_top: 232 MB | context: 8192 tk
								    #
								    # Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")   <-- ACTIVE
								    #     tokens: 120 tk | time: 12.248 s | speed: 9.798 tk/s
								    #     vram_bulk: 5382 MB | vram_top: 170 MB | context: 32768 tk
								    #     note: this works really well
								    #
								    # Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)
								    #     tokens: 112 tk | time: 12.703 s | speed: 8.816 tk/s
								    #     vram_bulk: 2108 MB | vram_top: 98 MB | context: 32768 tk
								    #
								    # Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)
								    #     tokens: 118 tk | time: 33.748 s | speed: 3.497 tk/s
								    #     vram_bulk: 3310 MB | vram_top: 60 MB | context: 32768 tk
								    #
								    # Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
								    #     model size: 3.87B params | vram and speed not recorded
								    #     error: requires the protobuf library but it was not found
								    #     in your environment
								    return Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")