From 2b3afd87e9163e2001e9c30e46b1b7806ecc6417 Mon Sep 17 00:00:00 2001
From: ddiddi
Date: Thu, 6 Mar 2025 13:41:38 -0800
Subject: [PATCH 1/3] feat: add ability to get aid from base solo coder

---
 solo_server/commands/aid.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 solo_server/commands/aid.py

diff --git a/solo_server/commands/aid.py b/solo_server/commands/aid.py
new file mode 100644
index 0000000..702fae2
--- /dev/null
+++ b/solo_server/commands/aid.py
@@ -0,0 +1,29 @@
+import sys
+import subprocess
+import typer
+
+def aid(query: str):
+    # Check if docker is running
+    try:
+        subprocess.run(["docker", "ps"], capture_output=True, check=True)
+    except subprocess.CalledProcessError:
+        typer.echo("Solo server not running. Please start solo-server first.")
+        return
+
+    # Execute the query in the solo-ollama container.
+    try:
+        result = subprocess.run(
+            ["docker", "exec", "solo-ollama", "ollama", "ask", query],
+            capture_output=True, text=True, check=True
+        )
+        typer.echo(result.stdout)
+    except subprocess.CalledProcessError:
+        typer.echo("Failed to get response from the LLM.")
+
+if __name__ == "__main__":
+    # If invoked with ">>" as the first argument, join the remaining tokens as the query.
+    if len(sys.argv) > 1 and sys.argv[1] == ">>":
+        query_text = " ".join(sys.argv[2:])
+        aid(query_text)
+    else:
+        typer.echo("Usage: solo >> <query>")

From eea8e2edabd73de3390417a0cc65b1a522a54908 Mon Sep 17 00:00:00 2001
From: ddiddi
Date: Thu, 6 Mar 2025 13:57:50 -0800
Subject: [PATCH 2/3] add @@ token for prompts

---
 solo_server/commands/aid.py | 54 ++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/solo_server/commands/aid.py b/solo_server/commands/aid.py
index 702fae2..60b94a9 100644
--- a/solo_server/commands/aid.py
+++ b/solo_server/commands/aid.py
@@ -1,29 +1,39 @@
 import sys
-import subprocess
 import typer
+from litgpt import LLM
+from rich.console import Console
 
-def aid(query: str):
-    # Check if docker is running
-    try:
-        subprocess.run(["docker", "ps"], capture_output=True, check=True)
-    except subprocess.CalledProcessError:
-        typer.echo("Solo server not running. Please start solo-server first.")
-        return
+console = Console()
+
+def query_llm(query: str):
+    # Check if the query exceeds 9000 characters
+    if len(query) > 9000:
+        typer.echo("Error: Your query exceeds the maximum allowed length of 9000 characters. It's over 9000!")
+        raise typer.Exit(1)
 
-    # Execute the query in the solo-ollama container.
-    try:
-        result = subprocess.run(
-            ["docker", "exec", "solo-ollama", "ollama", "ask", query],
-            capture_output=True, text=True, check=True
-        )
-        typer.echo(result.stdout)
-    except subprocess.CalledProcessError:
-        typer.echo("Failed to get response from the LLM.")
+    # Load the model and generate a response while showing a spinner
+    llm = LLM.load("Qwen/Qwen2.5-1.5B-Instruct")
+    with console.status("Generating response...", spinner="dots"):
+        response = llm.generate(query)
+    typer.echo(response)
+
+def interactive_mode():
+    console.print("Interactive Mode (type 'exit' or 'quit' to end):", style="bold green")
+    while True:
+        query_text = input(">> ")
+        if query_text.lower() in ("exit", "quit"):
+            break
+        query_llm(query_text)
 
 if __name__ == "__main__":
-    # If invoked with ">>" as the first argument, join the remaining tokens as the query.
-    if len(sys.argv) > 1 and sys.argv[1] == ">>":
-        query_text = " ".join(sys.argv[2:])
-        aid(query_text)
+    # If invoked with "@@" as the first argument, treat the rest as the query.
+    # Otherwise, launch interactive mode.
+    if len(sys.argv) > 1 and sys.argv[1] == "@@":
+        if len(sys.argv) > 2:
+            query_text = " ".join(sys.argv[2:])
+        else:
+            typer.echo("Enter your query (end with EOF / Ctrl-D):")
+            query_text = sys.stdin.read().strip()
+        query_llm(query_text)
     else:
-        typer.echo("Usage: solo >> <query>")
+        interactive_mode()

From 3b76bad4497d2e46a4031df0486e9d4cd3b9afa7 Mon Sep 17 00:00:00 2001
From: ddiddi
Date: Thu, 6 Mar 2025 14:03:19 -0800
Subject: [PATCH 3/3] add solo active ensemble file

---
 solo_server/solo.ensemble.yaml | 46 ++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 solo_server/solo.ensemble.yaml

diff --git a/solo_server/solo.ensemble.yaml b/solo_server/solo.ensemble.yaml
new file mode 100644
index 0000000..23503e0
--- /dev/null
+++ b/solo_server/solo.ensemble.yaml
@@ -0,0 +1,46 @@
+system_information:
+  operating_system: "Windows"
+  cpu: "AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD"
+  cpu_cores: 8
+  memory: "15.42GB"
+  gpu:
+    vendor: "NVIDIA"
+    model: "NVIDIA GeForce GTX 1660 Ti"
+    memory: "6144MB"
+  compute_backend: "CUDA"
+
+server_options:
+  - name: "Ollama"
+    recommended: true
+    details: "Optimized for systems with NVIDIA GPUs and CUDA support. (Recommended for your system.)"
+  - name: "vLLM"
+    recommended: false
+    details: "High-performance inference engine, best suited for Linux environments."
+  - name: "Llama.cpp"
+    recommended: false
+    details: "Lightweight and cross-platform; runs efficiently on CPU-only systems."
+  - name: "LitGPT"
+    recommended: false
+    details: "Lightning AI's PyTorch-based implementation."
+
+default_server: "Ollama"
+
+models:
+  solo-core-model:
+    model: "Qwen/Qwen2.5-1.5B-Instruct"
+    description: "Primary general-purpose model."
+  coding:
+    model: "qwen2.5-3b-coder"
+    description: "Optimized for code generation and programming tasks."
+  chat:
+    model: "deepseekr1-instruct-distill"
+    description: "Fine-tuned for conversational and chat applications."
+  robots:
+    model: "ottonomy-distill"
+    description: "Targeted for robotics and automation-related tasks."
+  healthcare_classification:
+    model: "palm"
+    description: "Optimized for healthcare data classification and analysis."
+  general:
+    model: "Qwen/Qwen2.5-1.5B-Instruct"
+    description: "Primary general-purpose model."
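
Note: after the second patch, aid.py drives query_llm() and interactive_mode() through a bare sys.argv check in its __main__ block, even though the module already depends on typer. Below is a minimal sketch of how the same helpers could instead be registered as a Typer command; the app object, the aid command name, the argument help text, and the aid_cli.py filename are illustrative assumptions and are not part of these patches.

# Hypothetical wiring sketch (not part of the patches above): exposes the patched
# query_llm()/interactive_mode() helpers through a Typer command instead of raw sys.argv.
from typing import Optional

import typer

from solo_server.commands.aid import interactive_mode, query_llm

app = typer.Typer()

@app.command()
def aid(query: Optional[str] = typer.Argument(None, help="Prompt for the local model")) -> None:
    """Answer a one-shot prompt, or start interactive mode when no prompt is given."""
    if query:
        query_llm(query)       # single prompt: generate one response and exit
    else:
        interactive_mode()     # no argument: drop into the '>>' REPL loop

if __name__ == "__main__":
    app()

Saved as the hypothetical aid_cli.py, running "python aid_cli.py 'explain decorators'" would answer once, while "python aid_cli.py" would open the interactive loop, mirroring the behaviour the second patch attaches to the "@@" token.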