lift inference part out of llama.py
inference.py (new file, 148 lines)
@@ -0,0 +1,148 @@
if __name__ == "__main__":
    # this message is at the start, because initializing torch/transformers takes lots of time. fail fast.
    raise Exception("cannot execute this file directly")


from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import time
import utils
import re


class Inference:
    def __init__(self):
        print("loading LLM...")
        t_start = time.time()

        # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model

        quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4", # Recommended for better performance
            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        )

        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

        # Load the model with quantization (optional)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # device_map="auto", # Automatically places parts of the model on GPU/CPU
            # device_map="cuda", # Automatically places parts of the model on GPU/CPU
            device_map="cuda", # Automatically places parts of the model on GPU/CPU
            # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
            quantization_config=quantization_config_8bit
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("load took %.3fs" % (time.time() - t_start))

        max_context_length = self.model.config.max_position_embeddings

        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")

        print("max_context_length is %d tokens." % (max_context_length))


    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids, # **inputs, inputs["input_ids"]
            max_new_tokens=500, # max_length=max_context_length,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            num_return_sequences=1
        )
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        return outputs, out_text


    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            return self._generate_incremental(input_ids)


    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
        # Start with the initial input tokens
        generated_tokens = input_ids # Initially, this is just the input tokens

        n = 0
        try:
            # Loop to generate one token at a time
            while True:
                # Call the model with the current tokens
                outputs = self.model(input_ids=generated_tokens, use_cache=True)

                # Get the next token (the last token from the generated sequence)
                next_token = outputs.logits.argmax(dim=-1)[:, -1]

                # Append the new token to the sequence
                generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)

                # Decode and print the newly generated token (skip special tokens)
                out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
                print(out_text, end="", flush=True) # Print without newline

                # Check if the generated token is the end-of-sequence token
                if next_token.item() == self.tokenizer.eos_token_id:
                    print("")
                    break

                n += 1
                if n >= 15:
                    n = 0
                    torch.cuda.empty_cache()

        except KeyboardInterrupt:
            pass

        # Once done, return the full generated sequence
        input_token_count = len(input_ids[0])
        full_output = self.tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)

        torch.cuda.empty_cache()

        return generated_tokens, full_output


    def tokenize(self, messages: list[dict], tokenize: bool) -> str | torch.Tensor:
        if tokenize:
            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True) #continue_final_message=True,
            inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
            return inputs["input_ids"]
        else:
            message = self.tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            return message


    def generate_tool_use_header(self, tools: list[callable]) -> str:
        temp_messages = [{}] # for some reason an empty array is not allowed but a {} inside works like an empty array.
        s = self.tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
        pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
        match = re.search(pattern, s, re.DOTALL)
        if not match:
            raise Exception("Failed to regex match the template tool system text.")
        extraction = match.group(1)
        return extraction


def torch_reseed(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
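For orientation, a minimal usage sketch of the lifted class (it mirrors the call sites in llama.py below; the message contents are placeholders and chat_template.json is assumed to be present in the working directory):

from inference import Inference, torch_reseed

inference = Inference()  # loads the model, tokenizer and chat template once

messages = [
    {"role": "system", "content": "Hold a casual conversation with the user."},
    {"role": "user", "content": "Hello, who are you?"},
]

input_ids = inference.tokenize(messages, tokenize=True)    # chat template -> token ids on the model device
tokens, reply = inference.generate_incremental(input_ids)  # streams tokens to stdout, returns the new text
messages.append({"role": "assistant", "content": reply})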
llama.py (209 lines)
@@ -1,54 +1,55 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import time
-import torch
+# import torch
 import random
 from tool_helper import tool_list, parse_and_execute_tool_call
 from tool_functions import register_dummy
-import utils
-import re
+# import utils
+# import re
+from inference import Inference, torch_reseed

-t_start = time.time()
+# t_start = time.time()

-# model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
-model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
-# model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
-# model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
-# "meta-llama/Llama-2-7b-hf" # Replace with your chosen model
+# # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
+# # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+# # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
+# # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model


-quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4", # Recommended for better performance
-    bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
-    bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
-)
+# quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
+#     load_in_4bit=True,
+#     bnb_4bit_quant_type="nf4", # Recommended for better performance
+#     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
+#     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
+# )

-quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
+# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

-# Load the model with quantization (optional)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    # device_map="auto", # Automatically places parts of the model on GPU/CPU
-    # device_map="cuda", # Automatically places parts of the model on GPU/CPU
-    device_map="cuda", # Automatically places parts of the model on GPU/CPU
-    # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
-    quantization_config=quantization_config_8bit
-)
+# # Load the model with quantization (optional)
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     # device_map="auto", # Automatically places parts of the model on GPU/CPU
+#     # device_map="cuda", # Automatically places parts of the model on GPU/CPU
+#     device_map="cuda", # Automatically places parts of the model on GPU/CPU
+#     # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
+#     quantization_config=quantization_config_8bit
+# )

-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# # Load tokenizer
+# tokenizer = AutoTokenizer.from_pretrained(model_name)

-print("load took %.3fs" % (time.time() - t_start))
+# print("load took %.3fs" % (time.time() - t_start))

-max_context_length = model.config.max_position_embeddings
+# max_context_length = model.config.max_position_embeddings


-tokenizer.chat_template = utils.load_json_file("chat_template.json")
+# tokenizer.chat_template = utils.load_json_file("chat_template.json")




-print("max_context_length is %d tokens." % (max_context_length))
+# print("max_context_length is %d tokens." % (max_context_length))


 # Generate text
@@ -95,6 +96,8 @@ messages = [
 # {"role": "user", "content": "Hello, who are you?"}
 ]

+inference = None
+
 systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences."

 roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}
@@ -106,67 +109,67 @@ register_dummy()



-def generate_batch(inputs):
-    outputs = model.generate(
-        inputs["input_ids"], # **inputs,
-        max_new_tokens=500, # max_length=max_context_length,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        do_sample=True,
-        num_return_sequences=1
-    )
-    # skip all input tokens and only output the additional generated part of the conversation
-    input_token_count = len(inputs["input_ids"][0])
-    out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
-    print(out_text)
-    return outputs, out_text
+# def generate_batch(inputs):
+#     outputs = model.generate(
+#         inputs["input_ids"], # **inputs,
+#         max_new_tokens=500, # max_length=max_context_length,
+#         pad_token_id=tokenizer.pad_token_id,
+#         eos_token_id=tokenizer.eos_token_id,
+#         do_sample=True,
+#         num_return_sequences=1
+#     )
+#     # skip all input tokens and only output the additional generated part of the conversation
+#     input_token_count = len(inputs["input_ids"][0])
+#     out_text = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
+#     print(out_text)
+#     return outputs, out_text



-def generate_incremental(inputs):
-    # Start with the initial input tokens
-    input_ids = inputs["input_ids"]
-    generated_tokens = input_ids # Initially, this is just the input tokens
+# def generate_incremental(inputs):
+#     # Start with the initial input tokens
+#     input_ids = inputs["input_ids"]
+#     generated_tokens = input_ids # Initially, this is just the input tokens

-    n = 0
-    try:
+#     n = 0
+#     try:

-        # Loop to generate one token at a time
-        while True:
-            # Call the model with the current tokens
-            outputs = model(input_ids=generated_tokens, use_cache=True)
+#         # Loop to generate one token at a time
+#         while True:
+#             # Call the model with the current tokens
+#             outputs = model(input_ids=generated_tokens, use_cache=True)

-            # Get the next token (the last token from the generated sequence)
-            next_token = outputs.logits.argmax(dim=-1)[:, -1]
+#             # Get the next token (the last token from the generated sequence)
+#             next_token = outputs.logits.argmax(dim=-1)[:, -1]

-            # Append the new token to the sequence
-            generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)
+#             # Append the new token to the sequence
+#             generated_tokens = torch.cat([generated_tokens, next_token.unsqueeze(0)], dim=1)

-            # Decode and print the newly generated token (skip special tokens)
-            out_text = tokenizer.decode(next_token, skip_special_tokens=True)
-            print(out_text, end="", flush=True) # Print without newline
+#             # Decode and print the newly generated token (skip special tokens)
+#             out_text = tokenizer.decode(next_token, skip_special_tokens=True)
+#             print(out_text, end="", flush=True) # Print without newline

-            # Check if the generated token is the end-of-sequence token
-            if next_token.item() == tokenizer.eos_token_id:
-                print("")
-                break
+#             # Check if the generated token is the end-of-sequence token
+#             if next_token.item() == tokenizer.eos_token_id:
+#                 print("")
+#                 break

-            n += 1
-            if n >= 15:
-                n = 0
-                torch.cuda.empty_cache()
+#             n += 1
+#             if n >= 15:
+#                 n = 0
+#                 torch.cuda.empty_cache()

-    except KeyboardInterrupt:
-        pass
+#     except KeyboardInterrupt:
+#         pass


-    # Once done, return the full generated sequence
-    input_token_count = len(inputs["input_ids"][0])
-    full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)
+#     # Once done, return the full generated sequence
+#     input_token_count = len(inputs["input_ids"][0])
+#     full_output = tokenizer.decode(generated_tokens[0][input_token_count:], skip_special_tokens=True)

-    torch.cuda.empty_cache()
+#     torch.cuda.empty_cache()

-    return generated_tokens, full_output
+#     return generated_tokens, full_output


 def append_generate_chat(input_text: str, role="user"):
@@ -176,15 +179,16 @@ def append_generate_chat(input_text: str, role="user"):
     if input_text != None:
         messages.append({"role": role, "content": input_text})

-    # input_text = "Hello, who are you?"
-    # inputs = tokenizer(input_text, return_tensors="pt").to("cpu") # .to("cuda") .to("cpu")
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True) #continue_final_message=True,
-    inputs = {key: value.to(model.device) for key, value in inputs.items()}
-    # inputs = {key: value.to("cpu") for key, value in inputs.items()}
-    # inputs["input_ids"] = inputs["input_ids"][:, 1:]
+    # # input_text = "Hello, who are you?"
+    # # inputs = tokenizer(input_text, return_tensors="pt").to("cpu") # .to("cuda") .to("cpu")
+    # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True, return_dict=True, add_generation_prompt=True) #continue_final_message=True,
+    # inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    # # inputs = {key: value.to("cpu") for key, value in inputs.items()}
+    # # inputs["input_ids"] = inputs["input_ids"][:, 1:]

-    with torch.inference_mode():
-        outputs, out_text = generate_incremental(inputs)
+    inputs = inference.tokenize(messages, tokenize=True)
+
+    outputs, out_text = inference.generate_incremental(inputs)

     # append result to message history
     messages.append({"role": "assistant", "content": out_text})
@@ -202,21 +206,24 @@ def append_generate_chat(input_text: str, role="user"):



-def generate_tool_use_header(tools: list[callable]) -> str:
-    temp_messages = [{}] # for some reason an empty array is not allowed but a {} inside works like an empty array.
-    s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
-    pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
-    match = re.search(pattern, s, re.DOTALL)
-    if not match:
-        raise Exception("Failed to regex match the template tool system text.")
-    extraction = match.group(1)
-    return extraction
+# def generate_tool_use_header(tools: list[callable]) -> str:
+#     temp_messages = [{}] # for some reason an empty array is not allowed but a {} inside works like an empty array.
+#     s = tokenizer.apply_chat_template(temp_messages, return_tensors="pt", tokenize=False, add_generation_prompt=False, tools=tools)
+#     pattern = r"<\|im_start\|>system\n(.*)<\|im_end\|>"
+#     match = re.search(pattern, s, re.DOTALL)
+#     if not match:
+#         raise Exception("Failed to regex match the template tool system text.")
+#     extraction = match.group(1)
+#     return extraction


 def main():
     global messages
+    global inference

-    messages = [{"role": "system", "content": systemmessage + "\n" + generate_tool_use_header(tool_list)}]
+    inference = Inference()
+
+    messages = [{"role": "system", "content": systemmessage + "\n" + inference.generate_tool_use_header(tool_list)}]

     while True:
         # print an input prompt to receive text or commands
@@ -235,7 +242,8 @@ def main():
             print("")

         elif input_text.startswith("/history"):
-            history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
+            history = inference.tokenize(messages, tokenize=False)
+            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
             print(history)

         elif input_text.startswith("/undo"):
@@ -251,8 +259,7 @@ def main():
             print("regenerating message (not working)")
             messages = messages[:-1]
             seed = random.randint(0, 2**32 - 1) # Generate a random seed
-            torch.manual_seed(seed)
-            torch.cuda.manual_seed_all(seed)
+            torch_reseed(seed)
             append_generate_chat(None)
         else:
             print("cannot regenerate because there are not enough messages on history.")
@@ -304,3 +311,7 @@ def main():
         else:
             append_generate_chat(input_text)

+
+
+if __name__ == "__main__":
+    main()
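The /regenerate branch above now goes through the lifted torch_reseed helper instead of calling torch.manual_seed and torch.cuda.manual_seed_all directly. A hypothetical sketch of the idea, reusing the inference object and messages from the earlier sketch: with sampling enabled (generate_batch uses do_sample=True), reseeding with the same value should reproduce the same draw, while a fresh random seed re-rolls the response.

import random
from inference import Inference, torch_reseed

inference = Inference()
input_ids = inference.tokenize(messages, tokenize=True)  # messages as in the sketch above

seed = random.randint(0, 2**32 - 1)

torch_reseed(seed)
_, first = inference.generate_batch(input_ids)

torch_reseed(seed)  # same seed before the second run
_, second = inference.generate_batch(input_ids)

print(first == second)  # expected True: identical seed, identical sampling decisions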