From 0f516477e9afc433744afd25064a2ad8360a9286 Mon Sep 17 00:00:00 2001
From: Zeeshaan Mohammed
Date: Tue, 4 Feb 2025 20:13:31 -0800
Subject: [PATCH 1/4] updated solo-server

---
 README.md                      |  78 ++++++++++--
 setup.py                       |   7 +-
 solo_server/cli.py             |  11 +-
 solo_server/commands/pull.py   |  26 ----
 solo_server/commands/run.py    |  26 ++++
 solo_server/commands/serve.py  |  56 ++++++---
 solo_server/commands/status.py |   8 +-
 solo_server/commands/stop.py   |  29 ++++-
 solo_server/setup.py           | 224 ++++++++++++---------------------
 solo_server/utils/__init__.py  |   1 +
 solo_server/utils/hardware.py  |  75 +++++++++++
 11 files changed, 328 insertions(+), 213 deletions(-)
 delete mode 100644 solo_server/commands/pull.py
 create mode 100644 solo_server/commands/run.py
 create mode 100644 solo_server/utils/__init__.py
 create mode 100644 solo_server/utils/hardware.py

diff --git a/README.md b/README.md
index 8496ae4..bbdae23 100644
--- a/README.md
+++ b/README.md
@@ -20,13 +20,13 @@ Solo Server is a lightweight platform that enables users to manage and monitor A
 
 ## Features
 - **Seamless Setup:** Manage your on-device AI with a simple CLI and HTTP servers
-- **Open Model Registry:** Pull models from registries like Hugging Face and Ollama
+- **Open Model Registry:** Pull models from registries like Ollama & Hugging Face
 - **Lean Load Testing:** Built-in commands to benchmark endpoints
 - **Cross-Platform Compatibility:** Deploy AI models effortlessly on your hardware
 - **Configurable Framework:** Auto-detects hardware (CPU, GPU, RAM) and sets configs
 
 ## Supported Models
-Solo Server supports **multiple model sources**, including **Ollama, Hugging Face, and Ramalama**.
+Solo Server supports **multiple model sources**, including **Ollama & Hugging Face**.
 
 | **Model Name**         | **Source**                                                |
 |------------------------|----------------------------------------------------------|
@@ -39,7 +39,7 @@ Solo Server supports **multiple model sources**, including **Ollama, Hugging Fac
 | **Mistral 7B v3**      | `hf://MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF`        |
 | **Hermes 2 Pro**       | `hf://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`          |
 | **Cerebrum 1.0 7B**    | `hf://froggeric/Cerebrum-1.0-7b-GGUF`                     |
-| **Dragon Mistral 7B**  | `hf://llmware/dragon-mistral-7b-v0`
+| **Dragon Mistral 7B**  | `hf://llmware/dragon-mistral-7b-v0`                       |
 
 ## Table of Contents
@@ -52,6 +52,12 @@
 
 ## Installation
 
+### **๐Ÿ”น Prerequisites**
+
+- **๐Ÿ‹ Docker:** Required for containerization
+  - [Install Docker](https://docs.docker.com/get-docker/)
+  - Ensure Docker daemon is running
+
 ### **๐Ÿ”น Install via PyPI**
 ```sh
 pip install solo-server
 ```
@@ -65,22 +71,39 @@ Creates an isolated environment using `uv` for performance and stability.
 
 Run the **interactive setup** to configure Solo Server:
 ```sh
-solo setup
+solo start
 ```
 
 ### **๐Ÿ”น Setup Features**
 โœ”๏ธ **Detects CPU, GPU, RAM** for **hardware-optimized execution**
 โœ”๏ธ **Auto-configures `solo.conf` with optimal settings**
 โœ”๏ธ **Requests API keys for Ngrok and Replicate**
 โœ”๏ธ **Recommends the compute backend OCI (CUDA, HIP, SYCL, Vulkan, CPU, Metal)**
 
 ---
 
+**Example Output:**
+```sh
+๐Ÿ–ฅ๏ธ System Information
+Operating System: Windows
+CPU: AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD
+CPU Cores: 8
+Memory: 15.42GB
+GPU: NVIDIA
+GPU Model: NVIDIA GeForce GTX 1660 Ti
+GPU Memory: 6.0GB
+Compute Backend: CUDA
+
+๐Ÿš€ Setting up Solo Server...
+โœ… Solo server is ready!
+``` + +--- + ## **Commands** -### **1๏ธโƒฃ Pull a Model** +### **1๏ธโƒฃ Pull & Run a Model** ```sh -solo pull llama3 +solo run llama3.2 ``` - --- @@ -96,6 +119,39 @@ http://127.0.0.1:5070 #SOLO_SERVER_PORT --- +## Diagram + +``` ++-------------------+ +| | +| solo run llama3.2 | +| | ++---------+---------+ + | + | + | +------------------+ +----------------------+ + | | Pull inferencing | | Pull model layer | + +-----------| runtime (cuda) |---------->| llama3.2 | + +------------------+ +----------------------+ + | Repo options | + ++-----------+--------++ + | | | + v v v + +----------+ +----------+ +-------------+ + | Ollama | | vLLM | | HuggingFace | + | Registry | | registry | | Registry | + +-----+------+---+------+-++------------+ + | | | + v v v + +---------------------+ + | Start with | + | cuda runtime | + | and | + | llama3.2 | + +---------------------+ +``` +--- + ### **3๏ธโƒฃ Benchmark a Model** ```sh solo benchmark llama3 @@ -148,12 +204,12 @@ solo status ### **5๏ธโƒฃ Stop a Model** ```sh -solo stop llama3 +solo stop ``` **Example Output:** ```sh -Stopping llama3... -llama3 stopped successfully. +๐Ÿ›‘ Stopping Solo Server... +โœ… Solo server stopped successfully. ``` --- diff --git a/setup.py b/setup.py index 3c66570..4206c92 100644 --- a/setup.py +++ b/setup.py @@ -11,11 +11,14 @@ description="AIOps for the Physical World.", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/AIEngineersDev/solo-server", + url="https://github.com/GetSoloTech/solo-server", packages=find_packages(include=["solo_server", "solo_server.*"]), include_package_data=True, install_requires=[ "typer", + "GPUtil", + "psutil", + "requests", ], extras_require={ "dev": ["pytest", "black", "isort"], @@ -23,7 +26,7 @@ python_requires=">=3.8", entry_points={ "console_scripts": [ - "solo-server=solo_server.cli:app", + "solo=solo_server.cli:app", ], }, ) \ No newline at end of file diff --git a/solo_server/cli.py b/solo_server/cli.py index 0d99c03..bd07026 100644 --- a/solo_server/cli.py +++ b/solo_server/cli.py @@ -1,15 +1,14 @@ import typer -from .commands import pull, serve, stop, status, benchmark -from .setup import interactive_setup +from .commands import run, serve, stop, status +from .setup import start app = typer.Typer() # Commands -app.command()(pull.pull) -app.command()(serve.serve) +app.command()(run.run) app.command()(stop.stop) app.command()(status.status) -app.command()(benchmark.benchmark) -app.command()(interactive_setup) +app.command()(serve.serve) +app.command()(start) if __name__ == "__main__": app() diff --git a/solo_server/commands/pull.py b/solo_server/commands/pull.py deleted file mode 100644 index ef84bab..0000000 --- a/solo_server/commands/pull.py +++ /dev/null @@ -1,26 +0,0 @@ -import typer -import subprocess - -def pull(model: str): - """ - Pulls a model using Ramalama registry. 
- """ - typer.echo(f"๐Ÿ”„ Pulling model {model} from Ramalama registry...") - - try: - command = ["ramalama", "pull", model] - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - - # Progress tracking - for line in process.stdout: - typer.echo(line.strip()) - - process.wait() - - if process.returncode == 0: - typer.echo(f"โœ… Model {model} pulled successfully.") - else: - typer.echo(f"โŒ Failed to pull model {model}.", err=True) - - except Exception as e: - typer.echo(f"โš ๏ธ Error pulling model {model}: {e}", err=True) diff --git a/solo_server/commands/run.py b/solo_server/commands/run.py new file mode 100644 index 0000000..259b566 --- /dev/null +++ b/solo_server/commands/run.py @@ -0,0 +1,26 @@ +import typer +import subprocess + +def run(model: str): + """ + Serves a model using Ollama and enables interactive chat. + """ + typer.echo(f"๐Ÿš€ Starting model {model}...") + + # Check if Docker container is running + try: + check_cmd = ["docker", "ps", "-q", "-f", "name=ollama"] + if not subprocess.run(check_cmd, capture_output=True, text=True).stdout: + typer.echo("โŒ Solo server is not active. Please run 'solo setup' first.", err=True) + return + + command = ["docker", "exec", "-it", "ollama", "ollama", "run", model] + + # Use subprocess.run with shell=True for interactive terminal + process = subprocess.run( + " ".join(command), + shell=True, + text=True + ) + except subprocess.CalledProcessError as e: + typer.echo(f"โŒ An error occurred: {e}", err=True) diff --git a/solo_server/commands/serve.py b/solo_server/commands/serve.py index c669dcc..d4eb7e0 100644 --- a/solo_server/commands/serve.py +++ b/solo_server/commands/serve.py @@ -1,20 +1,46 @@ +import requests +import json import typer -import subprocess -def serve(name: str, model: str): - """ - Serves a model using Ramalama. 
- """ - typer.echo(f"๐Ÿš€ Starting model {model} as {name}...") +def serve( + model: str = typer.Option("llama3.2", "--model", "-m", help="Model to use"), + input: str = typer.Option("Hello", "--input", "-i", help="Input text for inference"), + stream: bool = typer.Option(False, "--stream", "-s", help="Enable streaming mode") +): + # API Endpoint + url = "http://localhost:11434/api/chat" - try: - command = ["ramalama", "serve", model] - process = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # Chat request payload + data = { + "model": model, + "messages": [ + { + "role": "user", + "content": input + } + ], + "stream": stream # Set to True for streaming + } - typer.echo(f"โœ… Model {model} is now running as {name}.") - typer.echo(f"๐ŸŒ Access the UI at: http://127.0.0.1:5070") + if data["stream"] == False: + # Sending POST request + response = requests.post(url, json=data) + # Check if response is valid JSON + try: + response_json = response.json() + if "message" in response_json and "content" in response_json["message"]: + print("Assistant Response:", response_json["message"]["content"]) + else: + print("Unexpected Response:", json.dumps(response_json, indent=2)) + except json.JSONDecodeError: + print("Error: API did not return valid JSON.") + print("Raw Response:", response.text) - except subprocess.CalledProcessError as e: - typer.echo(f"โŒ Failed to serve model {model}: {e.stderr}", err=True) - except Exception as e: - typer.echo(f"โš ๏ธ Unexpected error: {e}", err=True) + + else: + with requests.post(url, json=data, stream=True) as response: + for line in response.iter_lines(): + if line: + json_obj = json.loads(line) + if "message" in json_obj and "content" in json_obj["message"]: + print(json_obj["message"]["content"], end="", flush=True) # Streaming output diff --git a/solo_server/commands/status.py b/solo_server/commands/status.py index f5a8512..d87e78c 100644 --- a/solo_server/commands/status.py +++ b/solo_server/commands/status.py @@ -1,10 +1,12 @@ import typer import subprocess +from solo_server.utils.hardware import display_hardware_info app = typer.Typer() @app.command() def status(): - """Check running models.""" - typer.echo("Checking running model containers...") - subprocess.run(["podman", "ps", "--filter", "name=solo-container"], check=True) + """Check running models and system status.""" + display_hardware_info(typer) + typer.echo("\n๐Ÿ” Running Models:") + subprocess.run(["docker", "ps"], check=True) diff --git a/solo_server/commands/stop.py b/solo_server/commands/stop.py index f0f576c..46eee66 100644 --- a/solo_server/commands/stop.py +++ b/solo_server/commands/stop.py @@ -1,17 +1,34 @@ import typer import subprocess -def stop(name: str): +def stop(name: str = ""): """ - Stops a running model container using Ramalama. + Stops the Ollama Docker container and any running models. 
""" - typer.echo(f"๐Ÿ›‘ Stopping {name} using Ramalama...") + typer.echo("๐Ÿ›‘ Stopping Solo Server...") try: - subprocess.run(["ramalama", "stop", name], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - typer.echo(f"โœ… {name} stopped successfully.") + # Stop the Docker container + subprocess.run( + ["docker", "stop", "ollama"], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + typer.echo("โœ… Solo server stopped successfully.") + + # # Remove the container + # subprocess.run( + # ["docker", "rm", "ollama"], + # check=True, + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE, + # text=True + # ) + # typer.echo("๐Ÿ—‘๏ธ Ollama container removed.") except subprocess.CalledProcessError as e: - typer.echo(f"โŒ Failed to stop {name}: {e.stderr}", err=True) + typer.echo(f"โŒ Failed to stop Solo Server: {e.stderr}", err=True) except Exception as e: typer.echo(f"โš ๏ธ Unexpected error: {e}", err=True) diff --git a/solo_server/setup.py b/solo_server/setup.py index 73aeecb..5d37e04 100644 --- a/solo_server/setup.py +++ b/solo_server/setup.py @@ -1,151 +1,87 @@ import typer -import os -import configparser -import platform import subprocess +import shutil +import time +from .utils.hardware import display_hardware_info -CONFIG_FILE = os.path.expanduser("~/.solo/solo.conf") +def start(): -def detect_hardware(): - """ - Detects system hardware (CPU, GPU, RAM) and suggests optimal configurations. - """ - typer.echo("๐Ÿ–ฅ๏ธ Detecting hardware specifications...") - - # Detect CPU - cpu_model = "Unknown" - cpu_cores = os.cpu_count() or 1 - - if platform.system() == "Windows": - cpu_model = platform.processor() - elif platform.system() == "Linux": - try: - cpu_model = subprocess.check_output("lscpu | grep 'Model name'", shell=True, text=True).split(":")[1].strip() - except: - cpu_model = "Unknown Linux CPU" - elif platform.system() == "Darwin": - try: - cpu_model = subprocess.check_output("sysctl -n machdep.cpu.brand_string", shell=True, text=True).strip() - except: - cpu_model = "Unknown Mac CPU" - - # Detect RAM - memory_gb = "Unknown" - if platform.system() == "Windows": - try: - memory_gb = int(subprocess.check_output("wmic ComputerSystem get TotalPhysicalMemory", shell=True, text=True).split("\n")[1].strip()) // (1024**3) - except: - memory_gb = "Unknown" - elif platform.system() == "Linux": - try: - memory_gb = int(subprocess.check_output("free -g | awk '/^Mem:/{print $2}'", shell=True, text=True).strip()) - except: - memory_gb = "Unknown" - elif platform.system() == "Darwin": - try: - memory_gb = int(subprocess.check_output("sysctl -n hw.memsize", shell=True, text=True)) // (1024**3) - except: - memory_gb = "Unknown" - - # Detect GPU - gpu_vendor = "None" - gpu_model = "None" - - if platform.system() == "Windows": - try: - gpu_info = subprocess.check_output("wmic path win32_VideoController get Name", shell=True, text=True).split("\n")[1].strip() - if "NVIDIA" in gpu_info: - gpu_vendor = "NVIDIA" - elif "AMD" in gpu_info: - gpu_vendor = "AMD" - elif "Intel" in gpu_info: - gpu_vendor = "Intel" - gpu_model = gpu_info - except: - gpu_vendor = "Unknown" - gpu_model = "Unknown" - elif platform.system() == "Linux": - try: - if subprocess.run("nvidia-smi", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0: - gpu_vendor = "NVIDIA" - gpu_model = subprocess.check_output("nvidia-smi --query-gpu=name --format=csv,noheader", shell=True, text=True).split("\n")[0] - elif subprocess.run("rocm-smi", shell=True, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0: - gpu_vendor = "AMD" - gpu_model = subprocess.check_output("rocm-smi --showproductname | awk -F ': ' '{print $2}'", shell=True, text=True).strip() - elif subprocess.run("lspci | grep -i vga", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0: - gpu_vendor = "Intel" - gpu_model = subprocess.check_output("lspci | grep -i vga | awk -F ': ' '{print $2}'", shell=True, text=True).strip() - except: - gpu_vendor = "Unknown" - gpu_model = "Unknown" - elif platform.system() == "Darwin": - try: - gpu_vendor = "Apple Silicon" - gpu_model = "Integrated GPU" - except: - gpu_vendor = "Unknown" - gpu_model = "Unknown" - - typer.echo(f"๐Ÿ–ฅ๏ธ CPU: {cpu_model} ({cpu_cores} cores)") - typer.echo(f"๐Ÿ’พ RAM: {memory_gb} GB") - typer.echo(f"๐ŸŽฎ GPU: {gpu_vendor} - {gpu_model}") - - # Recommend Compute Backend - if gpu_vendor == "NVIDIA": - compute_backend = "CUDA" - elif gpu_vendor == "AMD": - compute_backend = "HIP" - elif gpu_vendor == "Intel": - compute_backend = "SYCL" - elif gpu_vendor == "Apple Silicon": - compute_backend = "Metal" - else: - compute_backend = "CPU" - - typer.echo(f"โš™๏ธ Recommended Compute Backend: {compute_backend}") - - return cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, compute_backend - -def interactive_setup(): - """ - Runs an interactive setup to configure Solo CLI with hardware detection. - """ - typer.echo("๐Ÿ”ง Welcome to Solo Setup!") - typer.echo("Let'sconfigure your settings and API keys.") - - # Ensure config directory exists - os.makedirs(os.path.dirname(CONFIG_FILE), exist_ok=True) - - config = configparser.ConfigParser() - config["DEFAULT"] = {} - - # Detect hardware - cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, compute_backend = detect_hardware() - - # User Inputs - config["DEFAULT"]["MODEL_REGISTRY"] = typer.prompt("๐ŸŒ Model registry (ramalama/ollama)", default="ramalama") - config["DEFAULT"]["MODEL_PATH"] = typer.prompt("๐Ÿ“‚ Model storage path", default=os.path.expanduser("~/solo/models")) - config["DEFAULT"]["COMPUTE_BACKEND"] = typer.prompt(f"โš™๏ธ Compute backend (CPU/CUDA/HIP/SYCL/Vulkan) [Recommended: {compute_backend}]", default=compute_backend) - config["DEFAULT"]["SERVER_PORT"] = typer.prompt("๐ŸŒ Server port", default="5070") - config["DEFAULT"]["LOG_LEVEL"] = typer.prompt("๐Ÿ” Logging level (INFO/DEBUG/ERROR)", default="INFO") - - # API Keys - typer.echo("๐Ÿ”‘ Enter API keys (leave blank to skip).") - config["DEFAULT"]["NGROK_API_KEY"] = typer.prompt("Ngrok API Key", default="", show_default=False) - config["DEFAULT"]["REPLICATE_API_KEY"] = typer.prompt("Replicate API Key", default="", show_default=False) + """Setup solo-server environment.""" - # Store detected hardware details - config["DEFAULT"]["CPU_MODEL"] = cpu_model - config["DEFAULT"]["CPU_CORES"] = str(cpu_cores) - config["DEFAULT"]["MEMORY_GB"] = str(memory_gb) - config["DEFAULT"]["GPU_VENDOR"] = gpu_vendor - config["DEFAULT"]["GPU_MODEL"] = gpu_model - - # Save to file - with open(CONFIG_FILE, "w") as configfile: - config.write(configfile) - - typer.echo("โœ… Setup complete! Run `solo --help` to get started.") + display_hardware_info(typer) + typer.echo("\n๐Ÿš€ Setting up Solo Server...") + + if not shutil.which("docker"): + typer.echo("โŒ Docker is not installed. 
Please install Docker first.", err=True) + return + + try: + # Check if Docker daemon is running + subprocess.run(["docker", "info"], check=True, capture_output=True) + + # Check if container exists (running or stopped) + container_exists = subprocess.run( + ["docker", "ps", "-aq", "-f", "name=ollama"], + capture_output=True, + text=True + ).stdout.strip() + + if container_exists: + # Check if container is running + check_cmd = ["docker", "ps", "-q", "-f", "name=ollama"] + is_running = subprocess.run(check_cmd, capture_output=True, text=True).stdout.strip() + if not is_running: + subprocess.run(["docker", "start", "ollama"], check=True, capture_output=True) + else: + # Pull Ollama image + typer.echo("๐Ÿ“ฅ Pulling Ollama Docker image...") + subprocess.run(["docker", "pull", "ollama/ollama"], check=True) + + # Check if port is available + try: + subprocess.run( + ["docker", "run", "--rm", "-p", "11434:11434", "alpine", "true"], + check=True, + capture_output=True + ) + except subprocess.CalledProcessError: + typer.echo("โŒ Port 11434 is already in use", err=True) + return + + # Start Ollama container + typer.echo("๐Ÿš€ Starting Solo Server...") + subprocess.run([ + "docker", "run", "-d", + "--name", "solo", + "-v", "ollama:/root/.ollama", + "-p", "11434:11434", + "ollama/ollama" + ], check=True) + + # Wait for container to be ready with timeout + timeout = 30 + start_time = time.time() + while time.time() - start_time < timeout: + try: + subprocess.run( + ["docker", "exec", "ollama", "ollama", "list"], + check=True, + stdout=subprocess.DEVNULL # Only suppress stdout + ) + typer.echo("โœ… Solo server is ready!") + return + except subprocess.CalledProcessError: + time.sleep(1) + + typer.echo("โŒ Solo server failed to start within timeout", err=True) + + except subprocess.CalledProcessError as e: + typer.echo(f"โŒ Docker command failed: {e}", err=True) + # Cleanup on failure + if container_exists: + subprocess.run(["docker", "stop", "ollama"], check=False) + except Exception as e: + typer.echo(f"โŒ Unexpected error: {e}", err=True) if __name__ == "__main__": - interactive_setup() + start() diff --git a/solo_server/utils/__init__.py b/solo_server/utils/__init__.py new file mode 100644 index 0000000..62da138 --- /dev/null +++ b/solo_server/utils/__init__.py @@ -0,0 +1 @@ +# solo_server/__init__.py diff --git a/solo_server/utils/hardware.py b/solo_server/utils/hardware.py new file mode 100644 index 0000000..e4d6691 --- /dev/null +++ b/solo_server/utils/hardware.py @@ -0,0 +1,75 @@ +import platform +import psutil +import GPUtil +import subprocess +from typing import Tuple + +def detect_hardware() -> Tuple[str, int, float, str, str, float, str, str]: + #OS Info + os = platform.system() + + # CPU Info + cpu_model = "Unknown" + cpu_cores = psutil.cpu_count(logical=False) + + if os == "Windows": + cpu_model = platform.processor() + elif os == "Linux": + try: + cpu_model = subprocess.check_output("lscpu | grep 'Model name'", shell=True, text=True).split(":")[1].strip() + except: + cpu_model = "Unknown Linux CPU" + elif platform.system() == "Darwin": + try: + cpu_model = subprocess.check_output("sysctl -n machdep.cpu.brand_string", shell=True, text=True).strip() + except: + cpu_model = "Unknown Mac CPU" + + # Memory Info + memory_gb = round(psutil.virtual_memory().total / (1024**3), 2) + + # GPU Info + gpu_vendor = "None" + gpu_model = "None" + compute_backend = "CPU" + try: + gpus = GPUtil.getGPUs() + if gpus: + gpu = gpus[0] # Get first GPU + gpu_model = gpu.name + gpu_memory = 
round(gpu.memoryTotal, 2) # GPU memory in GB + if "NVIDIA" in gpu_model: + gpu_vendor = "NVIDIA" + compute_backend = "CUDA" + elif "AMD" in gpu_model: + gpu_vendor = "AMD" + compute_backend = "HIP" + elif "Intel" in gpu_model: + gpu_vendor = "Intel" + compute_backend = "OpenCL" + elif "Apple Silicon" in gpu_model: + gpu_vendor = "Apple Silicon" + compute_backend = "Metal" + else: + gpu_vendor = "Unknown" + compute_backend = "CPU" + except: + gpu_memory = 0.0 + pass + + return cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, gpu_memory, compute_backend, os + +def display_hardware_info(typer): + cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, gpu_memory, compute_backend, os = detect_hardware() + + typer.echo("------------------------------->") + typer.echo("๐Ÿ–ฅ๏ธ System Information") + typer.echo(f"Operating System: {os}") + typer.echo(f"CPU: {cpu_model}") + typer.echo(f"CPU Cores: {cpu_cores}") + typer.echo(f"Memory: {memory_gb}GB") + typer.echo(f"GPU: {gpu_vendor}") + typer.echo(f"GPU Model: {gpu_model}") + typer.echo(f'GPU Memory: {gpu_memory}GB') + typer.echo(f"Compute Backend: {compute_backend}") + \ No newline at end of file From fa1261d63a84142f85b6e3ad9bc82af4b6702643 Mon Sep 17 00:00:00 2001 From: Zeeshaan Mohammed Date: Tue, 4 Feb 2025 20:16:16 -0800 Subject: [PATCH 2/4] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bbdae23..299729a 100644 --- a/README.md +++ b/README.md @@ -127,8 +127,8 @@ http://127.0.0.1:5070 #SOLO_SERVER_PORT | solo run llama3.2 | | | +---------+---------+ - | - | + | + | | +------------------+ +----------------------+ | | Pull inferencing | | Pull model layer | +-----------| runtime (cuda) |---------->| llama3.2 | From 93bca1a1116e886aa42d74b83feececd111a3671 Mon Sep 17 00:00:00 2001 From: Zeeshaan Mohammed Date: Wed, 5 Feb 2025 18:42:29 -0800 Subject: [PATCH 3/4] nvidia cuda support --- setup.py | 3 +- solo_server/cli.py | 5 +- solo_server/commands/run.py | 6 +- solo_server/commands/status.py | 38 ++++++- solo_server/commands/stop.py | 2 +- solo_server/setup.py | 87 ----------------- solo_server/start.py | 174 +++++++++++++++++++++++++++++++++ 7 files changed, 218 insertions(+), 97 deletions(-) delete mode 100644 solo_server/setup.py create mode 100644 solo_server/start.py diff --git a/setup.py b/setup.py index 4206c92..167151d 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ "typer", "GPUtil", "psutil", - "requests", + "requests", + "tabulate", ], extras_require={ "dev": ["pytest", "black", "isort"], diff --git a/solo_server/cli.py b/solo_server/cli.py index bd07026..b7b3a3e 100644 --- a/solo_server/cli.py +++ b/solo_server/cli.py @@ -1,13 +1,12 @@ import typer -from .commands import run, serve, stop, status -from .setup import start +from .commands import run, stop, status +from .start import start app = typer.Typer() # Commands app.command()(run.run) app.command()(stop.stop) app.command()(status.status) -app.command()(serve.serve) app.command()(start) if __name__ == "__main__": diff --git a/solo_server/commands/run.py b/solo_server/commands/run.py index 259b566..79a8972 100644 --- a/solo_server/commands/run.py +++ b/solo_server/commands/run.py @@ -9,12 +9,12 @@ def run(model: str): # Check if Docker container is running try: - check_cmd = ["docker", "ps", "-q", "-f", "name=ollama"] + check_cmd = ["docker", "ps", "-q", "-f", "name=solo"] if not subprocess.run(check_cmd, capture_output=True, text=True).stdout: - typer.echo("โŒ Solo server is not active. 
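The `serve` command added in this patch is a thin client over Ollama's `/api/chat` endpoint. A minimal sketch of the same non-streaming request made outside the CLI, assuming the container from `solo start` is listening on the default `11434` port and a model such as `llama3.2` has already been pulled:

```python
import requests

# Mirror of the non-streaming request serve.py sends to the local Ollama API.
# Assumes `solo start` has completed; the model name is illustrative.
response = requests.post(
    "http://localhost:11434/api/chat",
    json={
        "model": "llama3.2",
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["message"]["content"])
```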
From fa1261d63a84142f85b6e3ad9bc82af4b6702643 Mon Sep 17 00:00:00 2001
From: Zeeshaan Mohammed
Date: Tue, 4 Feb 2025 20:16:16 -0800
Subject: [PATCH 2/4] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bbdae23..299729a 100644
--- a/README.md
+++ b/README.md
@@ -127,8 +127,8 @@ http://127.0.0.1:5070 #SOLO_SERVER_PORT
 | solo run llama3.2 |
 |                   |
 +---------+---------+
-          |
-          |
+          |
+          |
           |        +------------------+        +----------------------+
           |        | Pull inferencing |        | Pull model layer     |
 +-----------| runtime (cuda) |---------->| llama3.2 |
From 93bca1a1116e886aa42d74b83feececd111a3671 Mon Sep 17 00:00:00 2001
From: Zeeshaan Mohammed
Date: Wed, 5 Feb 2025 18:42:29 -0800
Subject: [PATCH 3/4] nvidia cuda support

---
 setup.py                       |   3 +-
 solo_server/cli.py             |   5 +-
 solo_server/commands/run.py    |   6 +-
 solo_server/commands/status.py |  38 ++++++-
 solo_server/commands/stop.py   |   2 +-
 solo_server/setup.py           |  87 -----------------
 solo_server/start.py           | 174 +++++++++++++++++++++++++++++++++
 7 files changed, 218 insertions(+), 97 deletions(-)
 delete mode 100644 solo_server/setup.py
 create mode 100644 solo_server/start.py

diff --git a/setup.py b/setup.py
index 4206c92..167151d 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,8 @@
         "typer",
         "GPUtil",
         "psutil",
-        "requests",
+        "requests",
+        "tabulate",
     ],
     extras_require={
         "dev": ["pytest", "black", "isort"],
diff --git a/solo_server/cli.py b/solo_server/cli.py
index bd07026..b7b3a3e 100644
--- a/solo_server/cli.py
+++ b/solo_server/cli.py
@@ -1,13 +1,12 @@
 import typer
-from .commands import run, serve, stop, status
-from .setup import start
+from .commands import run, stop, status
+from .start import start
 
 app = typer.Typer()
 
 # Commands
 app.command()(run.run)
 app.command()(stop.stop)
 app.command()(status.status)
-app.command()(serve.serve)
 app.command()(start)
 
 if __name__ == "__main__":
     app()
diff --git a/solo_server/commands/run.py b/solo_server/commands/run.py
index 259b566..79a8972 100644
--- a/solo_server/commands/run.py
+++ b/solo_server/commands/run.py
@@ -9,12 +9,12 @@ def run(model: str):
 
     # Check if Docker container is running
     try:
-        check_cmd = ["docker", "ps", "-q", "-f", "name=ollama"]
+        check_cmd = ["docker", "ps", "-q", "-f", "name=solo"]
         if not subprocess.run(check_cmd, capture_output=True, text=True).stdout:
-            typer.echo("โŒ Solo server is not active. Please run 'solo setup' first.", err=True)
+            typer.echo("โŒ Solo server is not active. Please start solo server first.", err=True)
             return
 
-        command = ["docker", "exec", "-it", "ollama", "ollama", "run", model]
+        command = ["docker", "exec", "-it", "solo", "ollama", "run", model]
 
         # Use subprocess.run with shell=True for interactive terminal
         process = subprocess.run(
diff --git a/solo_server/commands/status.py b/solo_server/commands/status.py
index d87e78c..2e2c7ec 100644
--- a/solo_server/commands/status.py
+++ b/solo_server/commands/status.py
@@ -1,6 +1,8 @@
 import typer
 import subprocess
 from solo_server.utils.hardware import display_hardware_info
+from tabulate import tabulate
+import json
 
 app = typer.Typer()
 
@@ -8,5 +10,37 @@ app = typer.Typer()
 @app.command()
 def status():
     """Check running models and system status."""
     display_hardware_info(typer)
-    typer.echo("\n๐Ÿ” Running Models:")
-    subprocess.run(["docker", "ps"], check=True)
+
+    # Check for running solo container
+    container_result = subprocess.run(["docker", "ps", "-f", "name=solo", "--format", "{{json .}}"],
+                                      capture_output=True, text=True, check=True)
+
+    if container_result.stdout.strip():
+        # Container is running, show available models
+        typer.echo("\n๐Ÿ” Available Models:")
+        models_result = subprocess.run(["docker", "exec", "solo", "ollama", "list"],
+                                       capture_output=True, text=True, check=True)
+        models = []
+        for line in models_result.stdout.strip().split('\n'):
+            parts = line.split()
+            if len(parts) >= 7:
+                size = f"{parts[2]} {parts[3]}"
+                modified = f"{parts[4]} {parts[5]} {parts[6]}"
+                models.append([parts[0], parts[1], size, modified])
+
+        if models:
+            print(tabulate(models, headers=['NAME', 'ID', 'SIZE', 'MODIFIED'], tablefmt='grid'))
+
+    # Show running containers section (will be empty if none running)
+    typer.echo("\n๐Ÿ” Running Containers:")
+    containers = []
+    if container_result.stdout.strip():
+        for line in container_result.stdout.strip().split('\n'):
+            container = json.loads(line)
+            containers.append([
+                container['Names'],
+                container['Status'],
+                container['Ports']
+            ])
+
+    print(tabulate(containers, headers=['NAME', 'STATUS', 'PORTS'], tablefmt='grid'))
diff --git a/solo_server/commands/stop.py b/solo_server/commands/stop.py
index 46eee66..e9486a8 100644
--- a/solo_server/commands/stop.py
+++ b/solo_server/commands/stop.py
@@ -10,7 +10,7 @@ def stop(name: str = ""):
     try:
         # Stop the Docker container
         subprocess.run(
-            ["docker", "stop", "ollama"],
+            ["docker", "stop", "solo"],
             check=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
diff --git a/solo_server/setup.py b/solo_server/setup.py
deleted file mode 100644
index 5d37e04..0000000
--- a/solo_server/setup.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import typer
-import subprocess
-import shutil
-import time
-from .utils.hardware import display_hardware_info
-
-def start():
-
-    """Setup solo-server environment."""
-
-    display_hardware_info(typer)
-    typer.echo("\n๐Ÿš€ Setting up Solo Server...")
-
-    if not shutil.which("docker"):
-        typer.echo("โŒ Docker is not installed. Please install Docker first.", err=True)
-        return
-
-    try:
-        # Check if Docker daemon is running
-        subprocess.run(["docker", "info"], check=True, capture_output=True)
-
-        # Check if container exists (running or stopped)
-        container_exists = subprocess.run(
-            ["docker", "ps", "-aq", "-f", "name=ollama"],
-            capture_output=True,
-            text=True
-        ).stdout.strip()
-
-        if container_exists:
-            # Check if container is running
-            check_cmd = ["docker", "ps", "-q", "-f", "name=ollama"]
-            is_running = subprocess.run(check_cmd, capture_output=True, text=True).stdout.strip()
-            if not is_running:
-                subprocess.run(["docker", "start", "ollama"], check=True, capture_output=True)
-        else:
-            # Pull Ollama image
-            typer.echo("๐Ÿ“ฅ Pulling Ollama Docker image...")
-            subprocess.run(["docker", "pull", "ollama/ollama"], check=True)
-
-            # Check if port is available
-            try:
-                subprocess.run(
-                    ["docker", "run", "--rm", "-p", "11434:11434", "alpine", "true"],
-                    check=True,
-                    capture_output=True
-                )
-            except subprocess.CalledProcessError:
-                typer.echo("โŒ Port 11434 is already in use", err=True)
-                return
-
-            # Start Ollama container
-            typer.echo("๐Ÿš€ Starting Solo Server...")
-            subprocess.run([
-                "docker", "run", "-d",
-                "--name", "solo",
-                "-v", "ollama:/root/.ollama",
-                "-p", "11434:11434",
-                "ollama/ollama"
-            ], check=True)
-
-        # Wait for container to be ready with timeout
-        timeout = 30
-        start_time = time.time()
-        while time.time() - start_time < timeout:
-            try:
-                subprocess.run(
-                    ["docker", "exec", "ollama", "ollama", "list"],
-                    check=True,
-                    stdout=subprocess.DEVNULL  # Only suppress stdout
-                )
-                typer.echo("โœ… Solo server is ready!")
-                return
-            except subprocess.CalledProcessError:
-                time.sleep(1)
-
-        typer.echo("โŒ Solo server failed to start within timeout", err=True)
-
-    except subprocess.CalledProcessError as e:
-        typer.echo(f"โŒ Docker command failed: {e}", err=True)
-        # Cleanup on failure
-        if container_exists:
-            subprocess.run(["docker", "stop", "ollama"], check=False)
-    except Exception as e:
-        typer.echo(f"โŒ Unexpected error: {e}", err=True)
-
-if __name__ == "__main__":
-    start()
diff --git a/solo_server/start.py b/solo_server/start.py
new file mode 100644
index 0000000..479361c
--- /dev/null
+++ b/solo_server/start.py
@@ -0,0 +1,174 @@
+import typer
+import subprocess
+import shutil
+import time
+from .utils.hardware import detect_hardware, display_hardware_info
+
+def check_nvidia_toolkit() -> bool:
+    """
+    Checks if Docker can actually run a GPU container using the NVIDIA runtime.
+    """
+    try:
+        test_cmd = [
+            "docker", "run", "--rm", "--gpus", "all",
+            "nvidia/cuda:11.0.3-base-ubuntu20.04", "nvidia-smi"
+        ]
+        subprocess.run(test_cmd, check=True, capture_output=True, text=True)
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+def install_nvidia_toolkit_linux():
+    """
+    Installs the NVIDIA Container Toolkit on Linux (Debian & Ubuntu).
+    """
+    typer.echo("Configuring the repository")
+    try:
+        subprocess.run("curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg", shell=True, check=True)
+        subprocess.run("curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list", shell=True, check=True)
+        subprocess.run("sudo apt-get update", shell=True, check=True)
+
+        typer.echo("Installing Nvidia Container Toolkit")
+        subprocess.run("sudo apt-get install -y nvidia-container-toolkit", shell=True, check=True)
+        subprocess.run("sudo nvidia-ctk runtime configure --runtime=docker", shell=True, check=True)
+        subprocess.run("sudo systemctl restart docker", shell=True, check=True)
+
+        typer.echo("NVIDIA Container Toolkit installed successfully on Linux.")
+    except subprocess.CalledProcessError as e:
+        typer.echo(f"Failed to install NVIDIA Container Toolkit on Linux. Error: {e}", err=True)
+
+
+def install_nvidia_toolkit_windows():
+    """
+    Provide a structured step-by-step guide for Windows users to configure
+    their system for NVIDIA GPU support, including driver & CUDA installation.
+    """
+    # Step-by-step instructions
+    typer.secho("\n========================================", fg=typer.colors.CYAN)
+    typer.secho("        Windows NVIDIA GPU Setup        ", fg=typer.colors.CYAN, bold=True)
+    typer.secho("========================================\n", fg=typer.colors.CYAN)
+
+    typer.echo("Follow these steps to enable NVIDIA GPU support on Windows:\n")
+
+    steps = [
+        ("Step 1: Install or Update NVIDIA Drivers", "https://www.nvidia.com/Download/index.aspx"),
+        ("Step 2: Install the NVIDIA CUDA Toolkit", "https://developer.nvidia.com/cuda-downloads")
+    ]
+    for step_num, (step_title, link) in enumerate(steps, start=1):
+        typer.secho(f"{step_title}", fg=typer.colors.BRIGHT_GREEN)
+        typer.echo(f"   Link: {link}\n")
+
+    typer.echo("Once you've completed the above steps:")
+    typer.echo(" - Ensure Docker Desktop is installed and running.")
+    typer.echo(" - Enable 'Use the WSL 2 based engine' in Docker Desktop settings.\n")
+
+    typer.secho("โš ๏ธ Please restart Solo Server after installing the required tools.", fg=typer.colors.YELLOW)
+    raise typer.Exit(1)
+
+def start():
+
+    """Setup solo-server environment."""
+
+    display_hardware_info(typer)
+    typer.echo("\n๐Ÿš€ Setting up Solo Server...")
+
+    if not shutil.which("docker"):
+        typer.echo(
+            "โŒ Docker is not installed. Please install Docker first.\n"
+            "Link: https://docs.docker.com/get-docker/\n",
+            err=True
+        )
+        raise typer.Exit(code=1)
+
+    try:
+        # Check if Docker daemon is running
+        subprocess.run(["docker", "info"], check=True, capture_output=True)
+        cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, gpu_memory, compute_backend, os = detect_hardware()
+        use_gpu = False
+
+        if gpu_vendor == "NVIDIA":
+            if check_nvidia_toolkit():
+                typer.echo("โœ… NVIDIA Docker Toolkit is already installed.\n")
+                use_gpu = True
+            else:
+                if typer.confirm("NVIDIA GPU detected but Toolkit is not installed. Do you want to install it?", default=False):
+                    if os == "Linux":
+                        install_nvidia_toolkit_linux()
+                    elif os == "Windows":
+                        install_nvidia_toolkit_windows()
+                    else:
+                        typer.echo("Unsupported OS for automated NVIDIA toolkit installation.")
+                else:
+                    typer.echo("โš ๏ธ Falling back to CPU.\n")
+
+        # Check if container exists (running or stopped)
+        container_exists = subprocess.run(
+            ["docker", "ps", "-aq", "-f", "name=solo"],
+            capture_output=True,
+            text=True
+        ).stdout.strip()
+
+        if container_exists:
+            # Check if container is running
+            check_cmd = ["docker", "ps", "-q", "-f", "name=solo"]
+            is_running = subprocess.run(check_cmd, capture_output=True, text=True).stdout.strip()
+            if not is_running:
+                subprocess.run(["docker", "start", "solo"], check=True, capture_output=True)
+        else:
+            # Pull Ollama image
+            typer.echo("๐Ÿ“ฅ Pulling Ollama Registry...")
+            subprocess.run(["docker", "pull", "ollama/ollama"], check=True)
+
+            # Check if port is available
+            try:
+                subprocess.run(
+                    ["docker", "run", "--rm", "-p", "11434:11434", "alpine", "true"],
+                    check=True,
+                    capture_output=True
+                )
+            except subprocess.CalledProcessError:
+                typer.echo("โŒ Port 11434 is already in use", err=True)
+                return
+
+            # Start Ollama container
+            docker_run_cmd = ["docker", "run", "-d", "--name", "solo", "-v", "ollama:/root/.ollama", "-p", "11434:11434"]
+            if use_gpu:
+                docker_run_cmd += ["--gpus", "all"]
+            docker_run_cmd.append("ollama/ollama")
+
+            typer.echo("๐Ÿš€ Starting Solo Server...")
+            subprocess.run(docker_run_cmd, check=True)
+
+        # Wait for container to be ready with timeout
+        timeout = 30
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                subprocess.run(
+                    ["docker", "exec", "solo", "ollama", "list"],
+                    check=True,
+                    stdout=subprocess.DEVNULL  # Only suppress stdout
+                )
+                typer.secho(
+                    "โœ… Solo server is ready!\nYou can now access the UI at: https://solo-chatbot.vercel.app/",
+                    fg=typer.colors.BRIGHT_CYAN,
+                    bold=True
+                )
+
+                return
+            except subprocess.CalledProcessError:
+                time.sleep(1)
+
+        typer.echo("โŒ Solo server failed to start within timeout", err=True)
+
+    except subprocess.CalledProcessError as e:
+        typer.echo(f"โŒ Docker command failed: {e}", err=True)
+        # Cleanup on failure
+        if container_exists:
+            subprocess.run(["docker", "stop", "solo"], check=False)
+    except Exception as e:
+        typer.echo(f"โŒ Unexpected error: {e}", err=True)
+
+if __name__ == "__main__":
+    start()
From cc90f60d2331b621f820c5478efb2bf47c6049d4 Mon Sep 17 00:00:00 2001
From: Zeeshaan Mohammed
Date: Thu, 6 Feb 2025 09:40:34 -0800
Subject: [PATCH 4/4] updated hardware.py

---
 solo_server/utils/hardware.py | 48 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/solo_server/utils/hardware.py b/solo_server/utils/hardware.py
index e4d6691..6002dfe 100644
--- a/solo_server/utils/hardware.py
+++ b/solo_server/utils/hardware.py
@@ -32,31 +32,29 @@ def detect_hardware() -> Tuple[str, int, float, str, str, float, str, str]:
     gpu_vendor = "None"
     gpu_model = "None"
     compute_backend = "CPU"
-    try:
-        gpus = GPUtil.getGPUs()
-        if gpus:
-            gpu = gpus[0]  # Get first GPU
-            gpu_model = gpu.name
-            gpu_memory = round(gpu.memoryTotal / 1024, 2)  # GPUtil reports MB; convert to GB
-            if "NVIDIA" in gpu_model:
-                gpu_vendor = "NVIDIA"
-                compute_backend = "CUDA"
-            elif "AMD" in gpu_model:
-                gpu_vendor = "AMD"
-                compute_backend = "HIP"
-            elif "Intel" in gpu_model:
-                gpu_vendor = "Intel"
-                compute_backend = "OpenCL"
-            elif "Apple Silicon" in gpu_model:
-                gpu_vendor = "Apple Silicon"
-                compute_backend = "Metal"
-            else:
-                gpu_vendor = "Unknown"
-                compute_backend = "CPU"
-    except:
-        gpu_memory = 0.0
-        pass
-
+    gpu_memory = 0
+
+    gpus = GPUtil.getGPUs()
+    if gpus:
+        gpu = gpus[0]  # Get first GPU
+        gpu_model = gpu.name
+        gpu_memory = round(gpu.memoryTotal / 1024, 2)  # GPUtil reports MB; convert to GB
+        if "NVIDIA" in gpu_model:
+            gpu_vendor = "NVIDIA"
+            compute_backend = "CUDA"
+        elif "AMD" in gpu_model:
+            gpu_vendor = "AMD"
+            compute_backend = "HIP"
+        elif "Intel" in gpu_model:
+            gpu_vendor = "Intel"
+            compute_backend = "OpenCL"
+        elif "Apple Silicon" in gpu_model:
+            gpu_vendor = "Apple Silicon"
+            compute_backend = "Metal"
+        else:
+            gpu_vendor = "Unknown"
+            compute_backend = "CPU"
+
     return cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, gpu_memory, compute_backend, os
 
 def display_hardware_info(typer):