@@ -4,6 +4,14 @@ if __name__ == "__main__":
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
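# Cache classes from transformers.cache_utils: StaticCache is the one actually used below,
# DynamicCache only appears in commented-out lines, and the remaining ones seem to be
# imported for experimenting with other KV-cache strategies.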
from transformers.cache_utils import (
    DynamicCache,
    SinkCache,
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
    QuantizedCacheConfig,
)
import torch
import time
import utils
@@ -20,6 +28,7 @@ class Inference:
        # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "gpt2"
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
@@ -44,6 +53,11 @@ class Inference:
            quantization_config=quantization_config_8bit
        )
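        # quantization_config_8bit is defined outside this hunk; judging by the import above it is
        # presumably a BitsAndBytesConfig requesting 8-bit weight loading.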
        # print("apply optimization")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.forward = torch.compile(self.model.forward, mode="reduce-overhead", fullgraph=True)

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -57,6 +71,11 @@ class Inference:
print("max_context_length is %d tokens." % (max_context_length)) |
|
|
|
|
|
|
|
|
|
|
|
def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]: |
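        # inference_mode() disables autograd bookkeeping for the whole generation pass.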
        with torch.inference_mode():
            return self.generate_incremental_2(input_ids)


    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids,  # **inputs, inputs["input_ids"]
@@ -64,14 +83,72 @@ class Inference:
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            num_return_sequences=1
            num_return_sequences=1,
            num_beams=1
        )
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        return outputs, out_text


    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        generated_tokens = input_ids
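        # Pseudo-streaming: repeatedly ask generate() for a small batch of extra tokens and
        # print each chunk as it arrives; the shared past_key_values cache means earlier
        # tokens should not have to be re-processed on every call.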
        # past_key_values = DynamicCache()
        past_key_values = StaticCache(config=self.model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
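        # StaticCache pre-allocates its key/value buffers for max_cache_len tokens (1024 here),
        # so prompt plus generated text must stay below that limit; DynamicCache would grow on demand.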
        # n = 0
        try:
            while True:
                outputs = self.model.generate(
                    generated_tokens,  # **inputs, inputs["input_ids"]
                    max_new_tokens=10,  # like streaming
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    do_sample=True,
                    num_return_sequences=1,
                    num_beams=1,
                    use_cache=True,
                    past_key_values=past_key_values
                )
                # past_key_values = outputs.past_key_values

                # Get the next token (the last token from the generated sequence)
                # next_token = outputs.argmax(dim=-1)[:, -1]
                new_tokens = outputs[0, len(generated_tokens[0]):]
                # next_token = outputs[0,-1]

                # Append the new token to the sequence
                generated_tokens = outputs
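                # generate() returns the prompt plus everything produced so far, so `outputs`
                # simply becomes the input for the next chunk.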
                # generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0).unsqueeze(0)], dim=1)

                # Decode and print the newly generated token (skip special tokens)
                # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
                out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
                print(out_text, end="", flush=True)  # Print without newline

                # Check if the generated token is the end-of-sequence token
                # if next_token.item() == self.tokenizer.eos_token_id:
                if new_tokens[-1].item() == self.tokenizer.eos_token_id:
                    print("")
                    break

                # n += 1
                # if n >= 15:
                #     n = 0
                #     torch.cuda.empty_cache()
        except KeyboardInterrupt:
            pass

        # Once done, return the full generated sequence
        input_token_count = len(input_ids[0])
        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)

        # torch.cuda.empty_cache()

        return generated_tokens, full_output


    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
@@ -83,13 +160,21 @@ class Inference:
        # Start with the initial input tokens
        generated_tokens = input_ids  # Initially, this is just the input tokens
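        # Unlike generate_incremental_2 above, this variant calls the model's forward pass
        # directly and picks the next token itself via a greedy argmax.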
        # past_key_values = DynamicCache()
        # max_cache_length = past_key_values.get_max_length()

        n = 0
        try:

            # Loop to generate one token at a time
            while True:
                # Call the model with the current tokens
                outputs = self.model(input_ids=generated_tokens, use_cache=True)
                outputs = self.model(
                    input_ids=generated_tokens,
                    use_cache=True,
                    num_beams=1
                    # past_key_values=past_key_values
                )

                # Get the next token (the last token from the generated sequence)
                next_token = outputs.logits.argmax(dim=-1)[:, -1]
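                # Greedy decoding: the logits at the last position describe the next-token
                # distribution; argmax picks the single most likely token.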