Compare commits: adcb172da4 ... master
12 commits

SHA1:
5e3747179f
44e5bd423e
03c93f4d8b
f9c4d3e2db
7224111a0b
0c022d4731
a697f49698
3218e7eb63
ef789375c8
7f0cb49156
19870cdea8
677eb6d0ea

.gitignore (vendored), 3 changed lines
@@ -1,4 +1,5 @@
/model/*
*.prof
__pycache__
*.venv
*.venv
*.egg-info

.vscode/launch.json (vendored), 2 changed lines
@@ -15,7 +15,7 @@
            "name": "PyDebug: __main__.py",
            "type": "debugpy",
            "request": "launch",
            "program": "__main__.py",
            "program": "chatbug/__main__.py",
            "console": "integratedTerminal"
        }
    ]

chatbug/__init__.py, new file (0 lines)

@@ -1,6 +1,7 @@
print("running __main__.-py")

from llama import main
from chatbug.llama import main_func


if __name__ == "__main__":
    main()
    main_func()

chatbug/download_model.py, new file (37 lines)
@@ -0,0 +1,37 @@


from chatbug.inference import Inference
from chatbug.modelconfig import Modelconfig


def main():
    # Model size: 3.21B params
    Inference(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))

    # Model size: 1.24B params
    Inference(Modelconfig("unsloth/Llama-3.2-1B", load_in_8bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))

    # Model size: 4.65B params
    Inference(Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True))

    # Model size: 3.21B params
    Inference(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_4bit=True))

    # Model size: 5.21B params
    Inference(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit", load_in_4bit=True))

    # Model size: 4.46B params
    Inference(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit", load_in_4bit=True))

    # Model size: 3.09B params
    Inference(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))

    # Model size: 3.87B params
    Inference(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit", load_in_4bit=True))


if __name__ == "__main__":
    main()

chatbug/file_append.py, new file (46 lines)
@@ -0,0 +1,46 @@
import os


def check_append_file(prompt: str) -> str:
    if "@" in prompt:
        parts = prompt.split(" ")
        content = []
        for part in parts:
            if part.startswith("@"):
                filename = part[1:]
                try:
                    if os.path.exists(filename):
                        with open(filename, "r", encoding="utf-8") as f:
                            content.append("%s:'''\n%s'''" % (filename, f.read()))
                except FileNotFoundError:
                    print(f"File '{filename}' not found.")
                except Exception as e:
                    print("exception encountered %s", e)
        content.append(prompt)
        return "\n".join(content)
    return prompt



if __name__ == "__main__":
    exit() # not accidentally trigger it

    # Create some sample files
    with open("fmain.py", "w") as f:
        f.write("# This is main.py\n")
    with open("finference.py", "w") as f:
        f.write("# This is inference.py\n")

    # Test cases
    test_prompts = [
        "@fmain.py",
        "@fmain.py @finference.py",
        "@fnonexistent.py",
        "@fmain.py @fnonexistent.py"
    ]

    for prompt in test_prompts:
        print(f"Testing prompt: {prompt}")
        result = check_append_file(prompt)
        print(f"Result: {result}")
        print("-" * 20)

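For reference, a small usage sketch of check_append_file (illustrative only; "notes.txt" is a placeholder filename, not part of this change set):

# Illustrative sketch, assuming a file "notes.txt" exists in the working directory.
from chatbug.file_append import check_append_file

expanded = check_append_file("@notes.txt what does this file say?")
# check_append_file() treats every "@"-prefixed token as a filename and prepends the
# file wrapped as  notes.txt:'''<contents>'''  before the original prompt text.
print(expanded)
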
chatbug/generation_loop.py, new file (170 lines)
@@ -0,0 +1,170 @@
import time
import json
import random
from chatbug.tool_helper import tool_list, parse_and_execute_tool_call
from chatbug.inference import Inference, torch_reseed
from chatbug.file_append import check_append_file



def msg(role: str, content: str) -> dict:
    return {"role": role, "content": content}


class Terminal:

    def __init__(self, inference: Inference, systemmessage: dict):
        self.inference = inference
        self.messages:list[dict] = [systemmessage]

        # these are meant to be overwritten by better ones
        self.roleflip = msg("system", "keep going.")
        self.summarize = msg("system", "summarize conversation")
        self.summarize_user = msg("system", "please summarize conversation")
        self.title_prompt = msg("system", "create a title for this conversation")

    def append_generate_chat(self, input_text: str, role="user"):
        t_start = time.time()

        # generate AI response
        if input_text != None:
            self.messages.append({"role": role, "content": input_text})

        inputs = self.inference.tokenize(self.messages, tokenize=True)
        number_of_input_tokens = inputs.shape[1]

        outputs, out_text = self.inference.generate(inputs)

        # append result to message history
        self.messages.append({"role": "assistant", "content": out_text})

        print("")
        time_taken = time.time() - t_start
        number_of_tokens = len(outputs[0])
        tokens_per_second = (number_of_tokens - number_of_input_tokens) / time_taken
        print("generation took %.3fs (%d tokens, %.3f t/s)" % (time_taken, number_of_tokens, tokens_per_second))

        # handle tool call and check if a tool call has happened.
        tool_result = parse_and_execute_tool_call(out_text, tool_list)
        if tool_result != None:
            # tool call happened
            tool_result = "<tool_response>%s</tool_response>" % tool_result
            # depending on the chat template the tool response tags must or must not be passed. :(
            self.append_generate_chat(tool_result, role="tool")

    def join(self):

        while True:
            # print an input prompt to receive text or commands
            input_text = input(">>> ")
            print("")

            input_text = check_append_file(input_text)


            if input_text.startswith("!"):
                self.append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
                # append_generate_chat("%s" % input_text[1:], role="tool") # depending on the chat template the tool response tags must or must not be passed. :(

            elif input_text.startswith("/clear"):
                print("clearing chat history")
                start_msg = self.messages[0]
                self.message = [start_msg]
                print("")

            elif input_text.startswith("/history"):
                history = self.inference.tokenize(self.messages, tokenize=False)
                # history = tokenizer.apply_chat_template(self.message, return_tensors="pt", tokenize=False, add_generation_prompt=False)
                print(history)

            elif input_text.startswith("/undo"):
                if len(self.messages) > 2:
                    print("undo latest prompt")
                    self.message = self.messages[:-2]
                else:
                    print("cannot undo because there are not enough self.message on history.")
                print("")

            elif input_text.startswith("/regen"):
                if len(self.messages) >= 2:
                    print("regenerating message (not working)")
                    self.messages = self.messages[:-1]
                    seed = random.randint(0, 2**32 - 1) # Generate a random seed
                    torch_reseed(seed)
                    self.append_generate_chat(None)
                else:
                    print("cannot regenerate because there are not enough self.message on history.")
                print("")

            elif input_text.startswith("/more"):
                self.append_generate_chat(None)

            elif input_text.startswith("/file"):
                filename = input_text[len("/file "):]
                print("read '%s' for prompt:" % filename)
                with open(filename, "r") as f:
                    content = f.read()
                print(content)
                self.append_generate_chat(content)

            elif input_text.startswith("/auto"):
                message_backup = self.messages
                self.messages = [self.roleflip]
                for m in self.message_backup:
                    role = m["role"]
                    content = m["content"]
                    if role == "user":
                        role = "assistant"
                    elif role == "assistant":
                        role = "user"
                    if role != "system":
                        self.message.append({"role": role, "content": content})
                self.append_generate_chat(None) # will automatically advance the conversation as 'user'
                last_message = self.messages[-1]
                last_message["role"] = "user"
                self.messages = message_backup + [last_message]
                self.append_generate_chat(None) # 'regular' chatbot answer

            elif input_text.startswith("/summarize"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.summarize] + messages_temp + [self.summarize_user] # copy dict in last instance
                # messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/title"):
                messages_temp = list(filter(lambda x: x["role"] != "system", self.messages))
                messages_temp = [self.title_prompt] + messages_temp #+ [dict(title)] # copy dict in last instance
                messages_temp[-1]["role"] = "user"
                input_ids = self.inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
                generated_tokens, full_output = self.inference.generate(input_ids)

            elif input_text.startswith("/save"):
                with open("messages.json", "w") as f:
                    json.dump(self.messages, f, indent=4)

            elif input_text.startswith("/load"):
                with open("messages.json", "r") as f:
                    new_messages = json.load(f)
                    self.messages = [self.messages[0]] + new_messages[1:]

            elif input_text.startswith("/help"):
                print("!<prompt> answer as 'tool' in <tool_response> tags")
                print("/clear clear chat history")
                print("/undo undo latest prompt")
                print("/regen regenerate the last message")
                print("/more generate more additional information")
                print("/file read prompt input from file")
                print("/auto automatically advance conversation")
                print("/summarize generate a summary of the chat")
                print("/title generate a title of the chat")
                print("/save write chat history to file")
                print("/load load previously saved history")
                print("/help print this message")
                print("")

            elif input_text.startswith("/"):
                print("unknown command.")

            else:
                self.append_generate_chat(input_text)

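For orientation, a minimal sketch of how Terminal is wired together (the real setup, including the full system prompts, is initialize_config() in chatbug/llama.py further down):

# Minimal sketch only; see chatbug/llama.py for the actual initialization.
from chatbug.inference import Inference
from chatbug.generation_loop import Terminal, msg
from chatbug import model_selection

inference = Inference(model_selection.get_model())
terminal = Terminal(inference, msg("system", "Hold a casual conversation with the user."))
terminal.roleflip = msg("system", "Keep the conversation going.")  # used by /auto
terminal.join()  # enters the ">>> " input loop with the /commands listed under /help
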
@@ -14,44 +14,52 @@ from transformers.cache_utils import (
)
import torch
import time
import utils
import re
import os
import chatbug.utils as utils
from chatbug.modelconfig import Modelconfig

torch.set_num_threads(os.cpu_count()) # Adjust this to the number of threads/cores you have


class Inference:
    def __init__(self):
        print("loading LLM...")
    def __init__(self, modelconfig: Modelconfig):
        print("loading LLM '%s'..." % modelconfig.model_name)
        t_start = time.time()

        # model_name = "NousResearch/Llama-2-7b-hf" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "NousResearch/Hermes-3-Llama-3.2-3B" # will cache on C:\Users\ftobler\.cache\huggingface\hub
        # model_name = "unsloth/phi-4-unsloth-bnb-4bit" #too big
        # model_name = "gpt2"
        # model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
        # model_name = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
        # "meta-llama/Llama-2-7b-hf" # Replace with your chosen model


        quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4", # Recommended for better performance
            bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
            bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        )
        # quantization_config_4bit = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
        #     load_in_4bit=True,
        #     bnb_4bit_quant_type="nf4", # Recommended for better performance
        #     bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
        #     bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
        # )

        quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
        # quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

        # Load the model with quantization (optional)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # device_map="auto", # Automatically places parts of the model on GPU/CPU
            # device_map="cuda", # Automatically places parts of the model on GPU/CPU
            device_map="cuda", # Automatically places parts of the model on GPU/CPU
            # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
            quantization_config=quantization_config_8bit
        )
        if modelconfig.bits_and_bytes_config != None:
            self.model = AutoModelForCausalLM.from_pretrained(
                modelconfig.model_name,
                # device_map="auto", # Automatically places parts of the model on GPU/CPU
                # device_map="cuda", # Automatically places parts of the model on GPU/CPU
                device_map="cuda", # Automatically places parts of the model on GPU/CPU
                # load_in_8bit=True, # Enables 8-bit quantization if bitsandbytes is installed
                quantization_config=modelconfig.bits_and_bytes_config
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                modelconfig.model_name,
                device_map="cuda",
            )

        # print("apply optimization")
        # self.model.generation_config.cache_implementation = "static"
@@ -59,25 +67,25 @@ class Inference:

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(modelconfig.model_name)

        print("load took %.3fs" % (time.time() - t_start))

        max_context_length = self.model.config.max_position_embeddings
        self.max_context_length = self.model.config.max_position_embeddings

        self.tokenizer.chat_template = utils.load_json_file("chat_template.json")

        print("max_context_length is %d tokens." % (max_context_length))
        print("max_context_length is %d tokens." % (self.max_context_length))


    def generate(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate(self, input_ids: torch.Tensor, print_stdout=True) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            with torch.no_grad():
                return self.generate_incremental_2(input_ids)
                return self.generate_incremental_2(input_ids, print_stdout)


    def generate_batch(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate_batch(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        outputs = self.model.generate(
            input_ids, # **inputs, inputs["input_ids"]
            max_new_tokens=500, # max_length=max_context_length,
@@ -90,11 +98,12 @@
        # skip all input tokens and only output the additional generated part of the conversation
        input_token_count = len(input_ids[0])
        out_text = self.tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(out_text)
        if print_stdout:
            print(out_text)
        return outputs, out_text


    def generate_incremental_2(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:

    def generate_incremental_2(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        generated_tokens = input_ids

        past_key_values = DynamicCache()
@@ -126,12 +135,14 @@
            # Decode and print the newly generated token (skip special tokens)
            # out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
            out_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(out_text, end="", flush=True) # Print without newline
            if print_stdout:
                print(out_text, end="", flush=True) # Print without newline

            # Check if the generated token is the end-of-sequence token
            # if next_token.item() == self.tokenizer.eos_token_id:
            if new_tokens[-1].item() == self.tokenizer.eos_token_id:
                print("")
                if print_stdout:
                    print("")
                break

            # n += 1
@@ -150,12 +161,12 @@
        return generated_tokens, full_output


    def generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        with torch.inference_mode():
            return self._generate_incremental(input_ids)
            return self._generate_incremental(input_ids, print_stdout)


    def _generate_incremental(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, str]:
    def _generate_incremental(self, input_ids: torch.Tensor, print_stdout:bool=True) -> tuple[torch.Tensor, str]:
        # Start with the initial input tokens
        generated_tokens = input_ids # Initially, this is just the input tokens

@@ -169,7 +180,7 @@
        while True:
            # Call the model with the current tokens
            outputs = self.model(
                input_ids=generated_tokens,
                input_ids=generated_tokens,
                use_cache=True,
                num_beams = 1
                # past_key_values=past_key_values
@@ -183,11 +194,13 @@

            # Decode and print the newly generated token (skip special tokens)
            out_text = self.tokenizer.decode(next_token, skip_special_tokens=True)
            print(out_text, end="", flush=True) # Print without newline
            if print_stdout:
                print(out_text, end="", flush=True) # Print without newline

            # Check if the generated token is the end-of-sequence token
            if next_token.item() == self.tokenizer.eos_token_id:
                print("")
                if print_stdout:
                    print("")
                break

            n += 1

chatbug/inference_profile_experiement.py, new file (76 lines)
@@ -0,0 +1,76 @@
import time
import nvidia_smi
import torch
import gc
from chatbug.inference import Inference
from chatbug.modelconfig import Modelconfig


def empty_cuda():
    while True:
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(0.5)
        vram = nvidia_smi.get_gpu_stats()["memory_used"]
        print("vram: %d MB" % vram)
        if vram < 200:
            return


def profile_ex(model_conf: Modelconfig):
    print("")
    empty_cuda()
    messages = [
        {"role": "system", "content": "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."},
        {"role": "user", "content": "How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?"},
    ]

    gpu_stats_before = nvidia_smi.get_gpu_stats()
    inference = Inference(model_conf)

    gpu_stats_loaded = nvidia_smi.get_gpu_stats()
    t_start = time.time()
    input_ids = inference.tokenize(messages, tokenize=True)
    generated_tokens, full_output = inference.generate_batch(input_ids, print_stdout=False)
    t_end = time.time()
    gpu_stats_after = nvidia_smi.get_gpu_stats()

    took = t_end - t_start
    tokens = len(generated_tokens[0])
    tokens_per = tokens / took
    vram_bulk = gpu_stats_loaded["memory_used"] - gpu_stats_before["memory_used"]
    vram_top = gpu_stats_after["memory_used"] - gpu_stats_loaded["memory_used"]
    print("model: %s" % model_conf.model_name)
    print("tokens: %d tk" % tokens)
    print("time: %.3f s" % took)
    print("speed: %.3f tk/s" % tokens_per)
    print("vram_bulk: %d MB" % vram_bulk)
    print("vram_top: %d MB" % vram_top)
    print("context: %d tk" % inference.max_context_length)
    print("")


def profile(model_conf):
    try:
        profile_ex(model_conf)
    except Exception as e:
        print("exception: " + str(e))
        pass


def main():
    profile(Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True))
    profile(Modelconfig("unsloth/Llama-3.2-1B"))
    profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/llama-3-8b-bnb-4bit"))
    # profile(Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True))
    profile(Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit"))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True))
    profile(Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True))
    profile(Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit"))



if __name__ == "__main__":
    main()

chatbug/llama.py, new file (46 lines)
@@ -0,0 +1,46 @@


import datetime
from chatbug.tool_helper import tool_list
from chatbug.tool_functions import register_dummy
from chatbug.inference import Inference
from chatbug.generation_loop import Terminal, msg
from chatbug import model_selection


register_dummy()


def initialize_config(inference: Inference) -> Terminal:

    # systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
    system_prompt = "Hold a casual conversation with the user. Keep responses short at max 5 sentences and on point. Answer using markdown to the user. When providing code examples, avoid comments which provide no additional information. Do not summarize."
    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    append_toolcalls = False
    if append_toolcalls:
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list))
    else:
        systemmessage = msg("system", system_prompt + "\n" + current_date_and_time)

    terminal = Terminal(inference, systemmessage)

    # system message for role flip so the model automatically answers for the user
    terminal.roleflip = msg("system", "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye.")

    # system messages and user message to bring the model to summarize the entire conversation
    terminal.summarize = msg("system", "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly.")
    terminal.summarize_user = msg("system", "Can you summarize the conversation?")

    # system message to create a conversation title
    terminal.title_prompt = msg("system", "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity.")
    return terminal


def main_func():
    inference = Inference(model_selection.get_model())
    terminal = initialize_config(inference)
    terminal.join()


if __name__ == "__main__":
    main_func()

chatbug/matheval/__init__.py, new file (3 lines)
@@ -0,0 +1,3 @@
from chatbug.matheval import ast
from chatbug.matheval import interpreter
from chatbug.matheval import lexer

@@ -1,6 +1,5 @@

import math_lexer as lexer
from math_lexer import Token
from chatbug.matheval import lexer
from chatbug.matheval.lexer import Token


class Statement:

@@ -1,10 +1,11 @@
import math_ast as ast



from sympy.parsing.sympy_parser import parse_expr
from sympy.core.numbers import Integer, One, Zero
from sympy import symbols, Eq, solveset, linsolve, nonlinsolve
from sympy.core.symbol import Symbol
from chatbug.matheval import ast


def interpret(statement: ast.Statement) -> str:

chatbug/model_selection.py, new file (95 lines)
@@ -0,0 +1,95 @@

from chatbug.modelconfig import Modelconfig



def get_model() -> Modelconfig:

    # model: NousResearch/Hermes-3-Llama-3.2-3B
    # tokens: 315 tk
    # time: 94.360 s
    # speed: 3.338 tk/s
    # vram_bulk: 3622 MB
    # vram_top: 80 MB
    # context: 131072 tk
    # model = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)

    # model: unsloth/Llama-3.2-1B
    # tokens: 589 tk
    # time: 39.348 s
    # speed: 14.969 tk/s
    # vram_bulk: 4708 MB
    # vram_top: 102 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-1B") # note, fast, but talks to itself. basically does not work.

    # model: unsloth/Llama-3.2-3B-Instruct
    # tokens: 285 tk
    # time: 75.363 s
    # speed: 3.782 tk/s
    # vram_bulk: 3512 MB
    # vram_top: 48 MB
    # context: 131072 tk
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct", load_in_8bit=True)

    # model: unsloth/llama-3-8b-bnb-4bit
    # tokens: 435 tk
    # time: 84.314 s
    # speed: 5.159 tk/s
    # vram_bulk: 5440 MB
    # vram_top: 216 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/llama-3-8b-bnb-4bit")

    # Model size: 3.21B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # working: DOES NOT LOAD
    # model = Modelconfig("unsloth/Llama-3.2-3B-Instruct-GGUF", load_in_8bit=True)

    # model: unsloth/gemma-2-9b-it-bnb-4bit
    # tokens: 154 tk
    # time: 32.727 s
    # speed: 4.706 tk/s
    # vram_bulk: 6156 MB
    # vram_top: 232 MB
    # context: 8192 tk
    # model = Modelconfig("unsloth/gemma-2-9b-it-bnb-4bit")

    # model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
    # tokens: 120 tk
    # time: 12.248 s
    # speed: 9.798 tk/s
    # vram_bulk: 5382 MB
    # vram_top: 170 MB
    # context: 32768 tk
    model = Modelconfig("unsloth/Qwen2.5-7B-Instruct-bnb-4bit") # note, this works really good

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 112 tk
    # time: 12.703 s
    # speed: 8.816 tk/s
    # vram_bulk: 2108 MB
    # vram_top: 98 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_4bit=True)

    # model: unsloth/Qwen2.5-3B-Instruct
    # tokens: 118 tk
    # time: 33.748 s
    # speed: 3.497 tk/s
    # vram_bulk: 3310 MB
    # vram_top: 60 MB
    # context: 32768 tk
    # model = Modelconfig("unsloth/Qwen2.5-3B-Instruct", load_in_8bit=True)

    # Model size: 3.87B params
    # vram used: xxxxx MB
    # speed xxxxx t/s
    # error: requires the protobuf library but it was not found in your environment
    # model = Modelconfig("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")

    return model

chatbug/modelconfig.py, new file (20 lines)
@@ -0,0 +1,20 @@

from transformers import BitsAndBytesConfig
import torch

class Modelconfig:
    def __init__(self, model_name, bits_and_bytes_config=None, load_in_8bit=False, load_in_4bit=False):
        self.model_name = model_name
        if load_in_4bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig( # tool calls don't really work in 4 bit mode
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4", # Recommended for better performance
                bnb_4bit_use_double_quant=True, # Optional: Further quantization for more memory saving
                bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation
            )
        elif load_in_8bit:
            assert bits_and_bytes_config == None
            self.bits_and_bytes_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            self.bits_and_bytes_config = bits_and_bytes_config

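In short, Modelconfig only decides which BitsAndBytesConfig (if any) the Inference class later hands to AutoModelForCausalLM.from_pretrained. A sketch of the three constructor paths, using model names that appear elsewhere in this change set:

# Sketch of the three cases handled by Modelconfig.__init__ above.
cfg_4bit = Modelconfig("unsloth/llama-3-8b-bnb-4bit", load_in_4bit=True)          # NF4, double quantization, bfloat16 compute
cfg_8bit = Modelconfig("NousResearch/Hermes-3-Llama-3.2-3B", load_in_8bit=True)   # plain BitsAndBytesConfig(load_in_8bit=True)
cfg_none = Modelconfig("unsloth/Qwen2.5-3B-Instruct")                             # bits_and_bytes_config stays None
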
@@ -1,10 +1,8 @@
import random
import datetime
from tool_helper import tool
import math_lexer
import math_ast
import math_interpreter
import utils
from chatbug.tool_helper import tool
import chatbug.matheval as matheval
import chatbug.utils as utils


# @tool
@@ -39,10 +37,10 @@ def math_evaluate(expression: str):
    Args:
        expression: A valid arithmetic expression (e.g., '2 + 3 * 4'). The expression must not contain '='."""
    try:
        tokens = math_lexer.tokenize(expression)
        parser = math_ast.Parser()
        tokens = matheval.lexer.tokenize(expression)
        parser = matheval.ast.Parser()
        ast = parser.parse(tokens)
        return math_interpreter.interpret(ast)
        return matheval.interpreter.interpret(ast)
    except Exception as e:
        utils.print_error("Tool call evaluation failed. - " + str(e))
        return "Tool call evaluation failed."
@@ -58,10 +56,10 @@ Args:
        expression = "solve " + " and ".join(equations) + " for " + " and ".join(variables)
        print(expression)

        tokens = math_lexer.tokenize(expression)
        parser = math_ast.Parser()
        tokens = matheval.lexer.tokenize(expression)
        parser = ast.Parser()
        ast = parser.parse(tokens)
        return math_interpreter.interpret(ast)
        return matheval.interpreter.interpret(ast)
    except Exception as e:
        utils.print_error("Tool call evaluation failed. - " + str(e))
        return "Tool call evaluation failed."
@@ -2,7 +2,7 @@
from typing import Callable, List, Optional
import json
import re
import utils
import chatbug.utils as utils

tool_list = []

chatbug/ui/__init__.py, new file (0 lines)

chatbug/ui/__main__.py, new file (20 lines)
@@ -0,0 +1,20 @@


from .server import start_server
from .serverwait import wait_for_server
from .ui import start_ui, _start_sandboxed


def start_ui():
    svr = start_server(start_thread=False)
    url = f"http://localhost:{svr.port}"
    # wait_for_server(url)
    # # start_ui(threaded=False)
    # import webview
    # w = webview.create_window('asdf', '../../web/index.html', min_size=(1200, 900), zoomable=True)
    # webview.start(ssl=True)

if __name__ == "__main__":
    start_ui()

chatbug/ui/bottle.py, new file (3771 lines): file diff suppressed because it is too large

chatbug/ui/bottle_svr.py, new file (50 lines)
@@ -0,0 +1,50 @@
#tornado needs this or it does not run
import asyncio
try:
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
except AttributeError:
    print("Probably running on linux")

from bottle import route, run, response, static_file, request, post
from .file_watchdog import FileWatchdog


class BottleServer:

    def __init__(self, listen="0.0.0.0", port=8080, start_thread=True, root="web"):

        self.root = root

        self.port = port
        self.listen = listen
        self.wdt = FileWatchdog(self.root)

        if start_thread:
            import threading
            self.thread = threading.Thread(target=self._run, args=())
            self.thread.name = "BottleServerThread"
            self.thread.daemon = True
            self.thread.start()
        else:
            self._run()

    def _home(self):
        return static_file("index.html", root= self.root)

    def _watchdog(self):
        return str(self.wdt.time)

    def _files(self, name):
        if name.endswith(".vue"):
            return static_file(name, root= self.root, mimetype="text/html")
        return static_file(name, root= self.root)

    def _run(self):

        route('/')(self._home)
        route('/watchdog')(self._watchdog)
        route('/<name:path>')(self._files)

        print(f"Starting server at {self.listen}:{self.port}")
        run(host=self.listen, port=self.port, debug=False, threaded=True, quiet=True)

chatbug/ui/file_watchdog.py, new file (47 lines)
@@ -0,0 +1,47 @@

import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


class MyHandler(FileSystemEventHandler):
    def __init__(self, function):
        self.function = function

    def on_any_event(self, _event):
        # Handle the event (e.g., file created, modified, deleted)
        self.function()


class FileWatchdog:
    def __init__(self, path):
        self.path = path
        self.time = 0

        event_handler = MyHandler(lambda: self.event_handler())

        self.observer = Observer()
        self.observer.schedule(event_handler, path, recursive=True)
        self.observer.start()

    def event_handler(self):
        #print("change detected")
        self.time = time.time()

    def stop(self):
        self.observer.stop()




if __name__ == "__main__":
    wdt = FileWatchdog("./web")

    try:
        while True:
            time.sleep(1)
            print(wdt.time)
    except KeyboardInterrupt:
        wdt.stop()

chatbug/ui/server.py, new file (10 lines)
@@ -0,0 +1,10 @@
from .bottle_svr import BottleServer


def start_server(start_thread=False):
    print("server start")
    return BottleServer(start_thread=start_thread, root="web")


if __name__ == "__main__":
    start_server()

chatbug/ui/serverwait.py, new file (29 lines)
@@ -0,0 +1,29 @@
import time
import requests
import socket



def wait_for_server(url, timeout=10, retry_interval=0.5):
    """
    Waits for a web server to become available by polling its URL.
    """

    start_time = time.monotonic()
    while time.monotonic() - start_time < timeout:
        try:
            # First, try a simple TCP connection to check if the port is open
            hostname, port = url.split("//")[1].split(":")
            port = int(port)
            with socket.create_connection((hostname, port), timeout=retry_interval):
                pass # If the connection succeeds, continue to the HTTP check

            # Then, make an HTTP request to ensure the server is responding correctly
            response = requests.get(url, timeout=retry_interval)
            response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
            return # Server is up and responding correctly
        except (requests.exceptions.RequestException, socket.error) as e:
            print(f"Server not yet available: {e}. Retrying in {retry_interval} seconds...")
            time.sleep(retry_interval)

    raise TimeoutError(f"Server at {url} did not become available within {timeout} seconds.")

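A small usage sketch of how this is meant to pair with the Bottle server. The commented-out wait_for_server(url) call in chatbug/ui/__main__.py suggests this flow; treat the snippet as an assumption about intended usage, not wired-up behavior:

# Sketch: start the server in a background thread, then block until it responds.
from chatbug.ui.server import start_server
from chatbug.ui.serverwait import wait_for_server

svr = start_server(start_thread=True)
wait_for_server(f"http://localhost:{svr.port}", timeout=10)
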
chatbug/ui/ui.py, new file (30 lines)
@@ -0,0 +1,30 @@


import webview
from threading import Thread


def start_ui(threaded=False):
    if threaded:
        _start_threaded()
    else:
        _start_normal()


def _start_threaded():
    t = Thread(target=start_ui, args=[False])
    t.run()

def _start_normal():
    webview.create_window('Geargenerator', 'http://localhost:8080', min_size=(1200, 900), zoomable=True)
    webview.start()

def _start_sandboxed():
    webview.create_window('Geargenerator', 'web_v2/geargenerator.html', min_size=(1200, 900), zoomable=True)
    webview.start(ssl=True)



if __name__ == "__main__":
    _start_sandboxed()
    # start_ui(threaded=False)

llama.py, deleted file (175 lines)
@@ -1,175 +0,0 @@
import time
import random
from tool_helper import tool_list, parse_and_execute_tool_call
from tool_functions import register_dummy
from inference import Inference, torch_reseed
import datetime



messages = []
inference = None

# systemmessage at the very begin of the chat. Will be concatenated with the automatic tool usage descriptions
systemmessage = "Hold a casual conversation with the user. Keep responses short at max 3 sentences. Answer using markdown to the user."

# system message for role flip so the model automatically answers for the user
roleflip = {"role": "system", "content": "Keep the conversation going, ask for more information on the subject. Keep messages short at max 1-2 sentences. Do not thank and say goodbye."}

# system messages and user message to bring the model to summarize the entire conversation
summarize = {"role": "system", "content": "Summarize the conversation as a single, cohesive paragraph. Avoid using any bullet points, numbers, or list formatting. Write in plain text with natural sentences that flow together seamlessly."}
summarize_user = {"role": "system", "content": "Can you summarize the conversation?"}

# system message to create a conversation title
title_prompt = {"role": "system", "content": "Please create a very short and descriptive title or label for this conversation. Maximum 2-5 words. Use only plain text, avoid numbering, special characters, or unnecessary formatting-focus on clarity and brevity."}



register_dummy()




def append_generate_chat(input_text: str, role="user"):
    t_start = time.time()

    # generate AI response
    if input_text != None:
        messages.append({"role": role, "content": input_text})

    inputs = inference.tokenize(messages, tokenize=True)

    outputs, out_text = inference.generate(inputs)

    # append result to message history
    messages.append({"role": "assistant", "content": out_text})

    print("")
    print("generation took %.3fs (%d tokens)" % (time.time() - t_start, len(outputs[0])))

    # handle tool call and check if a tool call has happened.
    tool_result = parse_and_execute_tool_call(out_text, tool_list)
    if tool_result != None:
        # tool call happened
        tool_result = "<tool_response>%s</tool_response>" % tool_result
        # depending on the chat template the tool response tags must or must not be passed. :(
        append_generate_chat(tool_result, role="tool")




def main():
    global messages
    global inference

    inference = Inference()

    current_date_and_time = datetime.datetime.now().strftime("Current date is %Y-%m-%d and its %H:%M %p right now.")
    messages = [{"role": "system", "content": systemmessage + "\n" + current_date_and_time + "\n" + inference.generate_tool_use_header(tool_list)}]

    while True:
        # print an input prompt to receive text or commands
        input_text = input(">>> ")
        print("")


        if input_text.startswith("!"):
            append_generate_chat("<tool_response>%s</tool_response>" % input_text[1:], role="tool")
            # append_generate_chat("%s" % input_text[1:], role="tool") # depending on the chat template the tool response tags must or must not be passed. :(

        elif input_text.startswith("/clear"):
            print("clearing chat history")
            start_msg = messages[0]
            messages = [start_msg]
            print("")

        elif input_text.startswith("/history"):
            history = inference.tokenize(messages, tokenize=False)
            # history = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False, add_generation_prompt=False)
            print(history)

        elif input_text.startswith("/undo"):
            if len(messages) > 2:
                print("undo latest prompt")
                messages = messages[:-2]
            else:
                print("cannot undo because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/regen"):
            if len(messages) >= 2:
                print("regenerating message (not working)")
                messages = messages[:-1]
                seed = random.randint(0, 2**32 - 1) # Generate a random seed
                torch_reseed(seed)
                append_generate_chat(None)
            else:
                print("cannot regenerate because there are not enough messages on history.")
            print("")

        elif input_text.startswith("/more"):
            append_generate_chat(None)

        elif input_text.startswith("/file"):
            filename = input_text[len("/file "):]
            print("read '%s' for prompt:" % filename)
            with open(filename, "r") as f:
                content = f.read()
            print(content)
            append_generate_chat(content)

        elif input_text.startswith("/auto"):
            messages_backup = messages
            messages = [roleflip]
            for m in messages_backup:
                role = m["role"]
                content = m["content"]
                if role == "user":
                    role = "assistant"
                elif role == "assistant":
                    role = "user"
                if role != "system":
                    messages.append({"role": role, "content": content})
            append_generate_chat(None) # will automatically advance the conversation as 'user'
            last_message = messages[-1]
            last_message["role"] = "user"
            messages = messages_backup + [last_message]
            append_generate_chat(None) # 'regular' chatbot answer

        elif input_text.startswith("/summarize"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [summarize] + messages_temp + [summarize_user] # copy dict in last instance
            # messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="The conversation was about ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/title"):
            messages_temp = list(filter(lambda x: x["role"] != "system", messages))
            messages_temp = [title_prompt] + messages_temp #+ [dict(title)] # copy dict in last instance
            messages_temp[-1]["role"] = "user"
            input_ids = inference.tokenize(messages_temp, tokenize=True, assistant_prefix="Title: ")
            generated_tokens, full_output = inference.generate(input_ids)

        elif input_text.startswith("/help"):
            print("!<prompt> answer as 'tool' in <tool_response> tags")
            print("/clear clear chat history")
            print("/undo undo latest prompt")
            print("/regen regenerate the last message")
            print("/more generate more additional information")
            print("/file read prompt input from file")
            print("/auto automatically advance conversation")
            print("/summarize generate a summary of the chat")
            print("/title generate a title of the chat")
            print("/help print this message")
            print("")

        elif input_text.startswith("/"):
            print("unknown command.")

        else:
            append_generate_chat(input_text)




if __name__ == "__main__":
    main()

@@ -1,3 +1,4 @@
transformers
accelerate
bitsandbytes
bitsandbytes
pytest

setup.py, new file (28 lines)
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages

setup(
    name='chatbug',
    version='0.1.0',
    description='A conversational AI chatbot',
    author='Florin Tobler',
    author_email='florin.tobler@hotmail.com',
    packages=find_packages(exclude=["tests"]),
    install_requires=[
        'transformers',
        'accelerate',
        'bitsandbytes',
        'pytest',
        'pywebview',
    ],
    entry_points={
        'console_scripts': [
            'chatbug=chatbug.llama:main_func',
            # a^      b^      c^    d^
            # a => the command line argument
            # b => the package name
            # c => the file name in the package (same as imports)
            # d => the function to call
            'chatbugui=chatbug.ui.__main__:start_ui',
        ],
    },
)

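The two console_scripts entries mean that, once the package is installed, the chatbug command resolves to chatbug.llama:main_func and chatbugui to chatbug.ui.__main__:start_ui. Running the chatbug entry point is equivalent to this short snippet:

# Equivalent of the installed `chatbug` console script.
from chatbug.llama import main_func

main_func()
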
@@ -1 +0,0 @@
# empty

@@ -1,32 +1,20 @@
import pytest
import tests.helper as helper
from tests import helper


inference = None
InferenceClass = None
Tensor = None


def prepare():
    if InferenceClass == None:
        test_import_inference_module_librarys()
    if inference == None:
        test_instantiate_inference_instance()


def test_import_inference_module_librarys():
    import inference
    import torch
    global InferenceClass
    global Tensor
    InferenceClass = inference.Inference
    Tensor = torch.Tensor


def test_instantiate_inference_instance():
    if InferenceClass == None:
        test_import_inference_module_librarys()
    global inference
    inference = InferenceClass()
    global Tensor
    if inference == None:
        from torch import Tensor as _Tensor
        from chatbug.inference import Inference
        from chatbug.model_selection import get_model
        inference = Inference(get_model())
        Tensor = _Tensor


def test_tool_header_generation():

@@ -1,6 +1,6 @@
import pytest
import tool_helper
import tests.helper as helper
import chatbug.tool_helper as tool_helper
from tests import helper



@@ -1,6 +1,6 @@
import pytest
import tool_functions

import chatbug.tool_functions as tool_functions
from tests import helper


def test_math_evaluate_1():
@@ -28,6 +28,13 @@ def test_math_evaluate_5():
    result = tool_functions.math_evaluate("sin(pi/2) + cos(0)")
    assert result == "sin(pi/2) + cos(0) = 2"

def test_math_evaluate_solve_a():
    result = tool_functions.math_evaluate("solve 240=x*r+x*r^2+x*r^3+s and r=1.618 and s=5 for x, r, s")
    assert result == "Solved equation system 240 = r**3*x + r**2*x + r*x + s, r = 1.61800000000000 and s = 5 for x=27.7393327937747=~27.739, r=1.61800000000000=~1.618 and s=5.00000000000000=~5.000."

def test_math_evaluate_solve_b():
    result = tool_functions.math_evaluate("solve 250=x+x*r+s and r=1.618 and s=0 for x, r, s")
    assert result == "Solved equation system 250 = r*x + s + x, r = 1.61800000000000 and s = 0 for x=95.4927425515661=~95.493, r=1.61800000000000=~1.618 and s=0."


@@ -54,4 +61,3 @@ def test_math_solver_3b():
def test_math_solver_4():
    result = tool_functions.math_evaluate("solve 2*x**3 + 3*y = 7 and x - y = 1 for x, y")
    assert result == "Solved equation system 2*x**3 + 3*y = 7 and x - y = 1 for x=~1.421 and y=~0.421."

@@ -1,7 +1,8 @@
import pytest
import tool_helper
from chatbug import tool_helper
from unittest import mock
import tests.helper as helper
from tests import helper
import re


@@ -40,34 +41,34 @@ def test_match_and_extract_matching3_with_newline():


def test_string_malformed_faulty():
    with mock.patch("utils.print_error") as print_error_mock:
    with mock.patch("chatbug.utils.print_error") as print_error_mock:
        result = tool_helper._execute_tool_call_str("{json_content}", [])
        assert result == None
        print_error_mock.assert_called_once() # this will check if the mocked function on the context was called.


def test_tool_call_json_1():
    with mock.patch("utils.print_error") as print_error_mock:
    with mock.patch("chatbug.utils.print_error") as print_error_mock:
        result = tool_helper._execute_tool_call_json({"name": "tool_dummy", "arguments": {"a": 1, "b": "zwei"}}, [helper.tool_dummy, helper.tool_dummy2])
        assert result == "result_1_zwei"
        assert print_error_mock.call_count == 0


def test_tool_call_json_2():
    with mock.patch("utils.print_error") as print_error_mock:
    with mock.patch("chatbug.utils.print_error") as print_error_mock:
        result = tool_helper._execute_tool_call_json({"name": "tool_dummy2", "arguments": {"text": "some_text"}}, [helper.tool_dummy, helper.tool_dummy2])
        assert result == "SOME_TEXT"
        assert print_error_mock.call_count == 0


def test_tool_call_json_non_existing_call_check():
    with mock.patch("utils.print_error") as print_error_mock:
    with mock.patch("chatbug.utils.print_error") as print_error_mock:
        result = tool_helper._execute_tool_call_json({"name": "tool_dummy_which_is_not_existing", "arguments": {"text": "some_text"}}, [helper.tool_dummy, helper.tool_dummy2])
        assert result == None
        assert print_error_mock.call_count == 1 # this will check if the mocked function on the context was called.

def test_tool_call_json_wrong_arguments_check():
    with mock.patch("utils.print_error") as print_error_mock:
    with mock.patch("chatbug.utils.print_error") as print_error_mock:
        result = tool_helper._execute_tool_call_json({"name": "tool_dummy", "arguments": {"a": "must_be_an_int_but_is_string", "b": "zwei"}}, [helper.tool_dummy, helper.tool_dummy2])
        assert result == None
        assert print_error_mock.call_count == 1 # this will check if the mocked function on the context was called.
@@ -75,7 +76,6 @@ def test_tool_call_json_wrong_arguments_check():


def test_regex_multiline():
    import re
    pattern = r"<start>(.*)</end>"

    # The text to search (spanning multiple lines)

web/index.html, new file (61 lines)
@@ -0,0 +1,61 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script> -->
    <link rel="stylesheet" href="stylesheet.css">
    <script src="alpine.min.js"></script>
    <script src="main.js"></script>
    <script src="watchdog.js"></script>
</head>
<body>
    <div class="sidebar">
        <h1>Chatbug 🪲</h1>
        <div class="button">🐛 New Chat</div>
        <div class="title">Today</div>
        <div class="button">Building Web UI with Bottle & Alpine.js</div>
        <div class="button">Coding in python</div>
        <div class="title">Last Week</div>
        <div class="title">Older</div>
    </div>
    <div class="mainarea">
        <!-- <h1 x-data="{ message: 'I ❤️ Alpine' }" x-text="message"></h1> -->

        <div class="message">
            <div class="bubble">Hello world</div>
        </div>
        <div class="response">
            <div class="">Hello! Nice to meet you. What's up?</div>
        </div>
        <div class="message">
            <div class="bubble">ah, just holding an example conversation with you</div>
        </div>
        <div class="response">
            <div class="">Got it! Fun stuff. What kind of projects are you working on these days?</div>
        </div>
        <div class="message">
            <div class="bubble">LLM chatbot named chatbug 🪲</div>
        </div>
        <div class="response">
            <div class="">Cool name! Chatbug sounds like a friendly one. How's it going?</div>
        </div>
        <div class="message">
            <div class="bubble">making a web ui with bottle and alpinejs</div>
        </div>

        <div class="input">
            <!-- toolbutton for tool submenu, normally hidden unless pressed -->
            <div class="button">+</div>
            <div class="tool list" style="display:none">
                <div class="tool button">attach file</div>
                <div class="tool button">regenerate</div>
                <div class="tool button">undo</div>
            </div>
            <input type="text">
            <!-- send -->
            <div class="button">↗</div>
        </div>
    </div>
</body>
</html>

web/main.js, new file (25 lines)
@@ -0,0 +1,25 @@
// import {createApp, ref, reactive} from 'vue';




// const app = createApp({
//     data() {

//         let msg = ref("hello world")

//         try {
//             msg.value = "" + pywebview.api
//         } catch (e) {
//             msg.value = "did not invoke " + e
//         }

//         window.msg = msg
//         return {
//             message: msg
//         };
//     }
// });
// app.mount('#app');

web/stylesheet.css, new file (117 lines)
@@ -0,0 +1,117 @@
body {
    background-color: black;
    color: white;
    font-family: Arial, Helvetica, sans-serif;
    margin: 0px;
    height: 100%;
}


.sidebar {
    width: 250px;
    background-color: #2a262a;
    float: left;
    height: 100%;
    position: absolute;
}

.sidebar h1 {
    margin: 20px;
}


.sidebar .title {
    font-size: 8pt;
    margin: 20px;
    margin-top: 30px;
    margin-bottom: 10px;
}
.sidebar .button {
    margin-left: 10px;
    margin-right: 10px;
    padding: 10px;
    border-radius: 10px;
}
.sidebar .button:hover {
    background-color: #423a42;
}

.mainarea {
    margin-left: 260px;
    height: 100%;
    position: absolute;
    right: 0;
    left: 0;
}

.message {
    display: flex;
    margin-left: 40px;
    margin-right: 10px;

}

.bubble {
    padding: 10px;
    border-radius: 10px;
    background-color: #416146;
    margin-left: auto;
    float: right;
    position: relative;
}

.response {
    display: flex;
    margin: 30px;
    position: relative;
}

.response::before {
    content: '🪲';
    position: absolute;
    top: -4px;
    left: -30px;
}


.input {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 10px;
    background-color: #2a262a;
    border-radius: 10px;
    width: 70%;
    margin: auto;
    position: absolute;
    bottom: 40px;
}

.tool.list {
    display: none;
    background-color: #fff;
    border: 1px solid #ccc;
    position: absolute;
    top: 100%;
    left: 0;
    z-index: 1;
    box-shadow: 0 2px 5px rgba(0,0,0,0.2);
}

.tool.button {
    cursor: pointer;
    padding: 5px 10px;
    margin: 5px;
}

.input input {
    flex-grow: 1;
    padding: 10px;
    border: 0px solid #ccc;
    background: none;
    color: white;
}
.input input:focus {
    outline: 0px solid black; /* Custom focus outline */

}

web/watchdog.js, new file (67 lines)
@@ -0,0 +1,67 @@


wdt = {
    last_wdt_time: 0,
    watchdog_counter: 0
}

pollFileChange = () => {
    setTimeout(() => {
        wdt.watchdog_counter++
        console.log(wdt.watchdog_counter)
        if (wdt.watchdog_counter > 20) {
            return
        }
        ajax({
            type: "GET",
            url: "/watchdog",
            success: (data) => {
                var time = Number(data)
                if (wdt.last_wdt_time == 0) {
                    wdt.last_wdt_time = time
                    pollFileChange()
                } else if (time > wdt.last_wdt_time) {
                    location.reload();
                } else {
                    pollFileChange()
                }
            },
        })
    }, 10000)
}

function ajax(setting) {
    if (typeof(shutdown) !== 'undefined') return
    var request = new XMLHttpRequest();
    request.open(setting.type, setting.url, true);
    request.setRequestHeader('Content-Type', setting.dataType)
    request.onload = function(data) {
        if (typeof(shutdown) !== 'undefined') return
        if (this.status >= 200 && this.status < 400) {
            if (setting.success) {
                setting.success(this.response)
            }
        } else {
            if (setting.error) {
                setting.error(this.response)
            }
        }
    }
    request.onerror = function(data) {
        if (typeof(shutdown) !== 'undefined') return
        if (setting.error) {
            setting.error(data)
        }
    }
    if (setting.data) {
        request.send(setting.data)
    } else {
        request.send()
    }
}



pollFileChange()
