different caching strategy

2025-01-04 15:47:43 +01:00
parent 18aec52501
commit 78b24d8f9f
2 changed files with 90 additions and 5 deletions
--- a/inference.py
+++ b/inference.py
@@ -4,6 +4,14 @@ if __name__ == "__main__":


 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers.cache_utils import (
+    DynamicCache,
+    SinkCache,
+    StaticCache,
+    SlidingWindowCache,
+    QuantoQuantizedCache,
+    QuantizedCacheConfig,
+)
 import torch
 import time
 import utils
@@ -20,6 +28,7 @@ class Inference:

        # model_name = "NousResearch/Llama-2-7b-hf"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B"  # will cache on C:\Users\ftobler\.cache\huggingface\hub
+        # model_name = "gpt2"
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf"  # Replace with your chosen model
@@ -44,6 +53,11 @@ class Inference:
            quantization_config=quantization_config_8bit
        )

+        # print("apply optimization")
+        # self.model.generation_config.cache_implementation = "static"
+        # self.model.forward = torch.compile(self.model.forward, mode="reduce-overhead", fullgraph=True)
+
+
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -57,6 +71,11 @@ class Inference:
        print("max_context_length is %d tokens." % (max_context_length))


+    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        """Default generation entry point: run generate_incremental_2 inside torch.inference_mode()."""
+        with torch.inference_mode():
+            return self.generate_incremental_2(input_ids)
+
    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids,  # **inputs, inputs["input_ids"]
@@ -64,14 +83,72 @@ class Inference:
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
-            num_return_sequences=1
+            num_return_sequences=1,
+            num_beams = 1
        )
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        return outputs, out_text
+    

+    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
+        """Generate a reply in short bursts, printing each burst as soon as it is decoded.
+
+        Calls ``self.model.generate`` repeatedly for a few tokens at a time while reusing
+        one ``StaticCache``. Stops on the EOS token, when the cache is full, when a call
+        adds no tokens, or when the user presses Ctrl+C.
+
+        Args:
+            input_ids: Prompt token ids; only row 0 is decoded and printed.
+
+        Returns:
+            ``(tokens, text)``: the prompt plus everything generated, and the decoded
+            generated part with special tokens skipped.
+        """
+        max_cache_len = 1024  # capacity of the pre-allocated key/value cache
+        chunk_size = 10  # tokens per generate() call; small bursts behave like streaming
+        prompt_len = input_ids.shape[1]
+        generated_tokens = input_ids
+
+        # NOTE(review): device and dtype are hard-coded; confirm they match how the model is loaded.
+        past_key_values = StaticCache(config=self.model.config, max_batch_size=1, max_cache_len=max_cache_len, device="cuda", dtype=torch.bfloat16)
+
+        try:
+            while True:
+                seen = generated_tokens.shape[1]
+                # Never ask for more tokens than the fixed-size cache still has room for.
+                budget = min(chunk_size, max_cache_len - seen)
+                if budget <= 0:
+                    print("")
+                    break
+
+                outputs = self.model.generate(
+                    generated_tokens,
+                    max_new_tokens=budget,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    do_sample=True,
+                    num_return_sequences=1,
+                    num_beams=1,
+                    use_cache=True,
+                    past_key_values=past_key_values,
+                )
+                # generate() returns prompt + continuation; keep only what this call added.
+                new_tokens = outputs[0, seen:]
+                generated_tokens = outputs
+                print(self.tokenizer.decode(new_tokens, skip_special_tokens=True), end="", flush=True)
+
+                # Stop on EOS; the emptiness check keeps new_tokens[-1] from raising IndexError.
+                if new_tokens.numel() == 0 or new_tokens[-1].item() == self.tokenizer.eos_token_id:
+                    print("")
+                    break
+        except KeyboardInterrupt:
+            pass  # Ctrl+C only cuts the reply short; whatever was generated so far is returned
+
+        full_output = self.tokenizer.decode(generated_tokens[0][prompt_len:], skip_special_tokens=True)
+        return generated_tokens, full_output


    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
@@ -83,13 +160,21 @@ class Inference:
        # Start with the initial input tokens
        generated_tokens = input_ids  # Initially, this is just the input tokens

+        # past_key_values = DynamicCache()
+        # max_cache_length = past_key_values.get_max_length()
+
        n = 0
        try:

            # Loop to generate one token at a time
            while True:
                # Call the model with the current tokens
-                outputs = self.model(input_ids=generated_tokens, use_cache=True)
+                outputs = self.model(
+                    input_ids=generated_tokens, 
+                    use_cache=True,
+                    num_beams = 1
+                    # past_key_values=past_key_values
+                )

                # Get the next token (the last token from the generated sequence)
                next_token = outputs.logits.argmax(dim=-1)[:, -1]
--- a/llama.py
+++ b/llama.py
@@ -39,7 +39,7 @@ def append_generate_chat(input_text: str, role="user"):

    inputs = inference.tokenize(messages, tokenize=True)

-    outputs, out_text = inference.generate_incremental(inputs)
+    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})
@@ -141,14 +141,14 @@ def main():
            messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
            # messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/title"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [title_prompt] + messages_temp #+ [dict(title)]  # copy dict in last instance
            messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-            generated_tokens, full_output = inference.generate_incremental(input_ids)
+            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/help"):
            print("!<prompt>   answer as 'tool' in <tool_response> tags")