different caching strategy
inference.py (89 changes)
@@ -4,6 +4,14 @@ if __name__ == "__main__":


 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers.cache_utils import (
+    DynamicCache,
+    SinkCache,
+    StaticCache,
+    SlidingWindowCache,
+    QuantoQuantizedCache,
+    QuantizedCacheConfig,
+)
 import torch
 import time
 import utils
@@ -20,6 +28,7 @@ class Inference:

         # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
         model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "gpt2"
         # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
         # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
         # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
@@ -44,6 +53,11 @@ class Inference:
             quantization_config=quantization_config_8bit
         )

+        # print("apply optimization")
+        # self.model.generation_config.cache_implementation = "static"
+        # self.model.forward = torch.compile(self.model.forward, mode="reduce-overhead", fullgraph=True)
+
+
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -57,6 +71,11 @@ class Inference:
         print("max_context_length is %d tokens." % (max_context_length))


+    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        with torch.inference_mode():
+            return self.generate_incremental_2(input_ids)
+
+
     def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
         outputs = self.model.generate(
             input_ids, # **inputs, inputs["input_ids"]
@@ -64,14 +83,72 @@ class Inference:
             pad_token_id=self.tokenizer.pad_token_id,
             eos_token_id=self.tokenizer.eos_token_id,
             do_sample=True,
-            num_return_sequences=1
+            num_return_sequences=1,
+            num_beams = 1
         )
         # skip all input tokens and only output the additional generated part of the conversation
         input_token_count = len(input_ids[0])
         out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
         print(out_text)
         return outputs, out_text


+    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        generated_tokens = input_ids
+
+        # past_key_values = DynamicCache()
+        past_key_values = StaticCache(config=self.model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
+
+        # n = 0
+        try:
+            while True:
+                outputs = self.model.generate(
+                    generated_tokens, # **inputs, inputs["input_ids"]
+                    max_new_tokens=10, # like streaming
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    do_sample=True,
+                    num_return_sequences=1,
+                    num_beams = 1,
+                    use_cache=True,
+                    past_key_values=past_key_values
+                )
+                # past_key_values = outputs.past_key_values
+
+                # Get the next token (the last token from the generated sequence)
+                # next_token = outputs.argmax(dim=-1)[:, -1]
+                new_tokens = outputs[0, len(generated_tokens[0]):]
+                # next_token = outputs[0,-1]
+
+                # Append the new token to the sequence
+                generated_tokens = outputs
+                # generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0).unsqueeze(0)], dim=1)
+
+                # Decode and print the newly generated token (skip special tokens)
+                # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
+                out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
+                print(out_text, end="", flush=True) # Print without newline
+
+                # Check if the generated token is the end-of-sequence token
+                # if next_token.item() == self.tokenizer.eos_token_id:
+                if new_tokens[-1].item() == self.tokenizer.eos_token_id:
+                    print("")
+                    break
+
+                # n += 1
+                # if n >= 15:
+                #     n = 0
+                #     torch.cuda.empty_cache()
+        except KeyboardInterrupt:
+            pass
+
+        # Once done, return the full generated sequence
+        input_token_count = len(input_ids[0])
+        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+
+        # torch.cuda.empty_cache()
+
+        return generated_tokens, full_output
+
+
     def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
@@ -83,13 +160,21 @@ class Inference:
         # Start with the initial input tokens
         generated_tokens = input_ids # Initially, this is just the input tokens

+        # past_key_values = DynamicCache()
+        # max_cache_length = past_key_values.get_max_length()
+
         n = 0
         try:

             # Loop to generate one token at a time
             while True:
                 # Call the model with the current tokens
-                outputs = self.model(input_ids=generated_tokens, use_cache=True)
+                outputs = self.model(
+                    input_ids=generated_tokens,
+                    use_cache=True,
+                    num_beams = 1
+                    # past_key_values=past_key_values
+                )

                 # Get the next token (the last token from the generated sequence)
                 next_token = outputs.logits.argmax(dim=-1)[:, -1]
llama.py (6 changes)
@@ -39,7 +39,7 @@ def append_generate_chat(input_text: str, role="user"):

    inputs = inference.tokenize(messages, tokenize=True)

-    outputs, out_text = inference.generate_incremental(inputs)
+    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
@@ -141,14 +141,14 @@ def main():
             messages_temp = [summarize] + messages_temp + [summarize_user] # copy dict in last instance
             # messages_temp[-1]["role"] = "user"
             input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

         elif input_text.startswith("/title"):
             messages_temp = list(filter(lambda x: x["role"] != "system", messages))
             messages_temp = [title_prompt] + messages_temp #+ [dict(title)] # copy dict in last instance
             messages_temp[-1]["role"] = "user"
             input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

         elif input_text.startswith("/help"):
             print("!<prompt> answer as 'tool' in <tool_response> tags")
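For reference, the new strategy in generate_incremental_2 amounts to allocating one StaticCache up front and passing it to repeated short model.generate() calls, so each 10-token chunk reuses the already-computed key/value states instead of re-prefilling the whole prompt. Below is a minimal standalone sketch of that pattern; it is not part of the commit. The model name and cache settings are taken from the diff, while the prompt text and the chunk count are illustrative placeholders.

# Sketch only: reuse one StaticCache across chunked generate() calls,
# mirroring generate_incremental_2 above. Prompt and loop bound are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import StaticCache

model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")

# Pre-allocated cache, sized as in the commit, shared by every generate() call below.
past_key_values = StaticCache(config=model.config, max_batch_size=1,
                              max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

generated = tokenizer("Hello, who are you?", return_tensors="pt").input_ids.to("cuda")
with torch.inference_mode():
    for _ in range(20):                       # at most 20 chunks of up to 10 tokens
        prev_len = generated.shape[1]
        generated = model.generate(
            generated,
            max_new_tokens=10,                # small chunks give a streaming-like feel
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
            past_key_values=past_key_values,  # reused, so earlier tokens are not re-prefilled
        )
        new_tokens = generated[0, prev_len:]  # only the tokens added in this chunk
        print(tokenizer.decode(new_tokens, skip_special_tokens=True), end="", flush=True)
        if generated[0, -1].item() == tokenizer.eos_token_id:
            break
print()

Judging from the "# like streaming" comment, the intent appears to be incremental console output from generate() while the fixed-size static cache keeps its allocation stable across chunks.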