different caching strategy
inference.py (89 changes)
@@ -4,6 +4,14 @@ if __name__ == "__main__":


 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers.cache_utils import (
+    DynamicCache,
+    SinkCache,
+    StaticCache,
+    SlidingWindowCache,
+    QuantoQuantizedCache,
+    QuantizedCacheConfig,
+)
 import torch
 import time
 import utils
@@ -20,6 +28,7 @@ class Inference:

         # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
         model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "gpt2"
         # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
         # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
         # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
@@ -44,6 +53,11 @@ class Inference:
             quantization_config=quantization_config_8bit
         )

+        # print("apply optimization")
+        # self.model.generation_config.cache_implementation = "static"
+        # self.model.forward = torch.compile(self.model.forward, mode="reduce-overhead", fullgraph=True)
+
+
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -57,6 +71,11 @@ class Inference:
         print("max_context_length is %d tokens." % (max_context_length))


+    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        with torch.inference_mode():
+            return self.generate_incremental_2(input_ids)
+
+
     def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
         outputs = self.model.generate(
             input_ids, # **inputs, inputs["input_ids"]
@@ -64,14 +83,72 @@ class Inference:
             pad_token_id=self.tokenizer.pad_token_id,
             eos_token_id=self.tokenizer.eos_token_id,
             do_sample=True,
-            num_return_sequences=1
+            num_return_sequences=1,
+            num_beams = 1
         )
         # skip all input tokens and only output the additional generated part of the conversation
         input_token_count = len(input_ids[0])
         out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
         print(out_text)
         return outputs, out_text


+    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        generated_tokens = input_ids
+
+        # past_key_values = DynamicCache()
+        past_key_values = StaticCache(config=self.model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
+
+        # n = 0
+        try:
+            while True:
+                outputs = self.model.generate(
+                    generated_tokens, # **inputs, inputs["input_ids"]
+                    max_new_tokens=10, # like streaming
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    do_sample=True,
+                    num_return_sequences=1,
+                    num_beams = 1,
+                    use_cache=True,
+                    past_key_values=past_key_values
+                )
+                # past_key_values = outputs.past_key_values
+
+                # Get the next token (the last token from the generated sequence)
+                # next_token = outputs.argmax(dim=-1)[:, -1]
+                new_tokens = outputs[0, len(generated_tokens[0]):]
+                # next_token = outputs[0,-1]
+
+                # Append the new token to the sequence
+                generated_tokens = outputs
+                # generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0).unsqueeze(0)], dim=1)
+
+                # Decode and print the newly generated token (skip special tokens)
+                # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
+                out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
+                print(out_text, end="", flush=True) # Print without newline
+
+                # Check if the generated token is the end-of-sequence token
+                # if next_token.item() == self.tokenizer.eos_token_id:
+                if new_tokens[-1].item() == self.tokenizer.eos_token_id:
+                    print("")
+                    break
+
+                # n += 1
+                # if n >= 15:
+                #     n = 0
+                #     torch.cuda.empty_cache()
+        except KeyboardInterrupt:
+            pass
+
+        # Once done, return the full generated sequence
+        input_token_count = len(input_ids[0])
+        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+
+        # torch.cuda.empty_cache()
+
+        return generated_tokens, full_output
+
+
     def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
@@ -83,13 +160,21 @@ class Inference:
         # Start with the initial input tokens
         generated_tokens = input_ids # Initially, this is just the input tokens

+        # past_key_values = DynamicCache()
+        # max_cache_length = past_key_values.get_max_length()
+
         n = 0
         try:

             # Loop to generate one token at a time
             while True:
                 # Call the model with the current tokens
-                outputs = self.model(input_ids=generated_tokens, use_cache=True)
+                outputs = self.model(
+                    input_ids=generated_tokens,
+                    use_cache=True,
+                    num_beams = 1
+                    # past_key_values=past_key_values
+                )

                 # Get the next token (the last token from the generated sequence)
                 next_token = outputs.logits.argmax(dim=-1)[:, -1]
llama.py (6 changes)
@@ -39,7 +39,7 @@ def append_generate_chat(input_text: str, role="user"):

    inputs = inference.tokenize(messages, tokenize=True)

-    outputs, out_text = inference.generate_incremental(inputs)
+    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
@@ -141,14 +141,14 @@ def main():
             messages_temp = [summarize] + messages_temp + [summarize_user] # copy dict in last instance
             # messages_temp[-1]["role"] = "user"
             input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

         elif input_text.startswith("/title"):
             messages_temp = list(filter(lambda x: x["role"] != "system", messages))
             messages_temp = [title_prompt] + messages_temp #+ [dict(title)] # copy dict in last instance
             messages_temp[-1]["role"] = "user"
             input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

         elif input_text.startswith("/help"):
             print("!<prompt> answer as 'tool' in <tool_response> tags")
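For reference, the new strategy in generate_incremental_2 amounts to allocating one StaticCache up front and passing it to repeated short model.generate() calls, so each 10-token chunk reuses the already-computed key/value states instead of re-prefilling the whole prompt. Below is a minimal standalone sketch of that pattern; it is not part of the commit. The model name and cache settings are taken from the diff, while the prompt text and the chunk count are illustrative placeholders.

# Sketch only: reuse one StaticCache across chunked generate() calls,
# mirroring generate_incremental_2 above. Prompt and loop bound are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import StaticCache

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")

# Pre-allocated cache, sized as in the commit, shared by every generate() call below.
past_key_values = StaticCache(config=model.config, max_batch_size=1,
                              max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

generated = tokenizer("Hello, who are you?", return_tensors="pt").input_ids.to("cuda")
with torch.inference_mode():
    for _ in range(20):                       # at most 20 chunks of up to 10 tokens
        prev_len = generated.shape[1]
        generated = model.generate(
            generated,
            max_new_tokens=10,                # small chunks give a streaming-like feel
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
            past_key_values=past_key_values,  # reused, so earlier tokens are not re-prefilled
        )
        new_tokens = generated[0, prev_len:]  # only the tokens added in this chunk
        print(tokenizer.decode(new_tokens, skip_special_tokens=True), end="", flush=True)
        if generated[0, -1].item() == tokenizer.eos_token_id:
            break
print()

Judging from the "# like streaming" comment, the intent appears to be incremental console output from generate() while the fixed-size static cache keeps its allocation stable across chunks.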