diff --git a/solo_server/commands/aid.py b/solo_server/commands/aid.py
new file mode 100644
index 0000000..60b94a9
--- /dev/null
+++ b/solo_server/commands/aid.py
@@ -0,0 +1,39 @@
+import sys
+import typer
+from litgpt import LLM
+from rich.console import Console
+
+console = Console()
+
+def query_llm(query: str):
+    # Check if the query exceeds 9000 characters
+    if len(query) > 9000:
+        typer.echo("Error: Your query exceeds the maximum allowed length of 9000 characters. It's over 9000!")
+        raise typer.Exit(1)
+
+    # Load the model and generate a response while showing a spinner
+    llm = LLM.load("Qwen/Qwen2.5-1.5B-Instruct")
+    with console.status("Generating response...", spinner="dots"):
+        response = llm.generate(query)
+    typer.echo(response)
+
+def interactive_mode():
+    console.print("Interactive Mode (type 'exit' or 'quit' to end):", style="bold green")
+    while True:
+        query_text = input(">> ")
+        if query_text.lower() in ("exit", "quit"):
+            break
+        query_llm(query_text)
+
+if __name__ == "__main__":
+    # If invoked with "@@" as the first argument, treat the rest as the query.
+    # Otherwise, launch interactive mode.
+    if len(sys.argv) > 1 and sys.argv[1] == "@@":
+        if len(sys.argv) > 2:
+            query_text = " ".join(sys.argv[2:])
+        else:
+            typer.echo("Enter your query (end with EOF / Ctrl-D):")
+            query_text = sys.stdin.read().strip()
+        query_llm(query_text)
+    else:
+        interactive_mode()
diff --git a/solo_server/solo.ensemble.yaml b/solo_server/solo.ensemble.yaml
new file mode 100644
index 0000000..23503e0
--- /dev/null
+++ b/solo_server/solo.ensemble.yaml
@@ -0,0 +1,46 @@
+system_information:
+  operating_system: "Windows"
+  cpu: "AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD"
+  cpu_cores: 8
+  memory: "15.42GB"
+  gpu:
+    vendor: "NVIDIA"
+    model: "NVIDIA GeForce GTX 1660 Ti"
+    memory: "6144MB"
+    compute_backend: "CUDA"
+
+server_options:
+  - name: "Ollama"
+    recommended: true
+    details: "Optimized for systems with NVIDIA GPUs and CUDA support. (Recommended for your system.)"
+  - name: "vLLM"
+    recommended: false
+    details: "High-performance inference engine, best suited for Linux environments."
+  - name: "Llama.cpp"
+    recommended: false
+    details: "Lightweight and cross-platform; runs efficiently on CPU-only systems."
+  - name: "LitGPT"
+    recommended: false
+    details: "Lightning AI's PyTorch-based implementation."
+
+default_server: "Ollama"
+
+models:
+  solo-core-model:
+    model: "Qwen/Qwen2.5-1.5B-Instruct"
+    description: "Primary general-purpose model."
+  coding:
+    model: "qwen2.5-3b-coder"
+    description: "Optimized for code generation and programming tasks."
+  chat:
+    model: "deepseekr1-instruct-distill"
+    description: "Fine-tuned for conversational and chat applications."
+  robots:
+    model: "ottonomy-distill"
+    description: "Targeted for robotics and automation-related tasks."
+  healthcare_classification:
+    model: "palm"
+    description: "Optimized for healthcare data classification and analysis."
+  general:
+    model: "Qwen/Qwen2.5-1.5B-Instruct"
+    description: "Primary general-purpose model."
\ No newline at end of file
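Usage sketch (not part of the diff): the __main__ block in aid.py supports a one-shot query mode selected by the "@@" marker and an interactive fallback. The module path and invocations below are assumptions based on the new file's location; the eventual solo-server CLI entry point may differ.

    # One-shot query: everything after "@@" is joined into a single prompt.
    #   python -m solo_server.commands.aid @@ "Which server option suits an NVIDIA GPU?"
    # With "@@" but no further arguments, the prompt is read from stdin until EOF (Ctrl-D).
    # With no arguments at all, the script starts the interactive ">>" loop (type exit/quit to leave).
    #
    # The query helper can also be imported and called directly (assumed import path):
    from solo_server.commands.aid import query_llm

    query_llm("Summarize what solo.ensemble.yaml configures.")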