different caching strategy
llama.py (6 changed lines)
@@ -39,7 +39,7 @@ def append_generate_chat(input_text: str, role="user"):
     inputs = inference.tokenize(messages, tokenize=True)
-    outputs, out_text = inference.generate_incremental(inputs)
+    outputs, out_text = inference.generate(inputs)

     # append result to message history
     messages.append({"role": "assistant", "content": out_text})
@@ -141,14 +141,14 @@ def main():
         messages_temp = [summarize] + messages_temp + [summarize_user]  # copy dict in last instance
         # messages_temp[-1]["role"] = "user"
         input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
-        generated_tokens, full_output = inference.generate_incremental(input_ids)
+        generated_tokens, full_output = inference.generate(input_ids)

     elif input_text.startswith("/title"):
         messages_temp = list(filter(lambda x: x["role"] != "system", messages))
         messages_temp = [title_prompt] + messages_temp  #+ [dict(title)]  # copy dict in last instance
         messages_temp[-1]["role"] = "user"
         input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
-        generated_tokens, full_output = inference.generate_incremental(input_ids)
+        generated_tokens, full_output = inference.generate(input_ids)

     elif input_text.startswith("/help"):
         print("!<prompt> answer as 'tool' in <tool_response> tags")
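For context, the sketch below illustrates the kind of difference the commit message hints at: a generate() call that lets the model's own generation loop manage the KV cache internally, versus a generate_incremental() variant that carries past_key_values forward manually token by token. The class layout, method signatures, and model name are assumptions for illustration only; they are not taken from this repository.

# Hypothetical sketch, not the repo's actual implementation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class Inference:
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def tokenize(self, messages, tokenize=True, assistant_prefix=""):
        # Build the chat prompt; optionally force the assistant reply to
        # start with a fixed prefix (e.g. "Title: ").
        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        ) + assistant_prefix
        return self.tokenizer(text, return_tensors="pt").input_ids

    def generate(self, input_ids, max_new_tokens=256):
        # Single call; the KV cache is handled internally by model.generate(),
        # so each new token needs only one forward pass over that token.
        outputs = self.model.generate(input_ids, max_new_tokens=max_new_tokens)
        new_tokens = outputs[0, input_ids.shape[1]:]
        return outputs, self.tokenizer.decode(new_tokens, skip_special_tokens=True)

    def generate_incremental(self, input_ids, max_new_tokens=256):
        # Explicit token-by-token loop that threads past_key_values through
        # each step itself; functionally similar, but the caching strategy
        # lives in this loop rather than inside model.generate().
        past, generated = None, []
        cur = input_ids
        for _ in range(max_new_tokens):
            out = self.model(cur, past_key_values=past, use_cache=True)
            past = out.past_key_values
            next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            if next_id.item() == self.tokenizer.eos_token_id:
                break
            generated.append(next_id)
            cur = next_id  # feed only the new token; the cache holds the rest
        new_tokens = torch.cat(generated, dim=-1) if generated else input_ids[:, :0]
        full = torch.cat([input_ids, new_tokens], dim=-1)
        return full, self.tokenizer.decode(new_tokens[0], skip_special_tokens=True)

Under those assumptions, the commit simply switches the chat loop, the /summarize path, and the /title path from the manual incremental loop to the internally cached generate() call.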