diff --git a/solo_server/Dockerfile.finetune b/solo_server/Dockerfile.finetune
new file mode 100644
index 0000000..230d1b5
--- /dev/null
+++ b/solo_server/Dockerfile.finetune
@@ -0,0 +1,29 @@
+FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    gcc \
+    g++ \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip install --no-cache-dir \
+    datasets \
+    peft \
+    typer \
+    requests \
+    transformers \
+    triton \
+    bitsandbytes \
+    trl \
+    accelerate \
+    unsloth_zoo \
+    "unsloth[cu121-torch221] @ git+https://github.com/unslothai/unsloth.git"
+
+# Set working directory
+WORKDIR /app
+
+# Copy the application code
+COPY . .
diff --git a/solo_server/cli.py b/solo_server/cli.py
index 343cad8..5e251a9 100644
--- a/solo_server/cli.py
+++ b/solo_server/cli.py
@@ -1,8 +1,11 @@
 import typer
 from solo_server.commands import run, stop, status, benchmark, download_hf as download
+from solo_server.commands import finetune
 from solo_server.main import setup
 
 app = typer.Typer()
+finetune_app = typer.Typer()
+app.add_typer(finetune_app, name="finetune")
 
 # Commands
 app.command()(run.run)
@@ -12,6 +15,11 @@
 app.command()(benchmark.benchmark)
 app.command()(setup)
 
+# Finetune commands
+finetune_app.command(name="gen")(finetune.gen)
+finetune_app.command(name="status")(finetune.status)
+finetune_app.command(name="download")(finetune.download)
+finetune_app.command(name="run")(finetune.run)
 
 if __name__ == "__main__":
     app()
diff --git a/solo_server/commands/finetune.py b/solo_server/commands/finetune.py
new file mode 100644
index 0000000..3f83d6e
--- /dev/null
+++ b/solo_server/commands/finetune.py
@@ -0,0 +1,302 @@
+import typer
+import requests
+import json
+from typing import Optional
+from pathlib import Path
+import subprocess
+import os
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.text import Text
+from rich.box import ROUNDED
+from solo_server.config import CONFIG_PATH
+
+BASE_URL = "https://api.starfishdata.ai/v1"
+
+def get_starfish_api_key() -> str:
+    """Get Starfish API key from environment or config file"""
+    # First check environment variable
+    api_key = os.getenv('STARFISH_API_KEY', '')
+
+    if not api_key:  # If not in env, try config file
+        if os.path.exists(CONFIG_PATH):
+            with open(CONFIG_PATH, 'r') as f:
+                config = json.load(f)
+            api_key = config.get('starfish', {}).get('api_key', '')
+
+    if not api_key:
+        if os.name in ["Linux", "Windows"]:
+            typer.echo("Use Ctrl + Shift + V to paste your token.")
+        api_key = typer.prompt("Please enter your Starfish API key")
+
+        # Save token if provided
+        if api_key:
+            if os.path.exists(CONFIG_PATH):
+                with open(CONFIG_PATH, 'r') as f:
+                    config = json.load(f)
+            else:
+                config = {}
+
+            config['starfish'] = {'api_key': api_key}
+            with open(CONFIG_PATH, 'w') as f:
+                json.dump(config, f, indent=4)
+
+    return api_key
+
+def gen(
+    prompt: str,
+    num_records: Optional[int] = typer.Option(100, "--num-records", "-n", help="Number of records to generate"),
+    model: Optional[str] = typer.Option("gpt-4o-mini-2024-07-18", "--model", "-m", help="Model to use for generation")
+):
+    """
+    Generate synthetic data using StarfishData API.
+
+    Example:
+        solo finetune gen "Generate customer service conversations about product returns"
+    """
+    api_key = get_starfish_api_key()
+    if not api_key:
+        typer.echo("❌ Starfish API key is required", err=True)
+        raise typer.Exit(1)
+
+    data = {
+        "prompt": prompt,
+        "numOfRecords": num_records,
+        "model": model
+    }
+
+    headers = {
+        'Content-Type': 'application/json',
+        'x-api-key': api_key
+    }
+
+    try:
+        response = requests.post(
+            f'{BASE_URL}/generateData',
+            headers=headers,
+            data=json.dumps(data)
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        console = Console()
+
+        # Create a table
+        table = Table(show_header=False, box=ROUNDED)
+        table.add_column("Key", style="cyan")
+        table.add_column("Value", style="green")
+
+        table.add_row("Job ID", result.get('jobId'))
+        table.add_row("Project ID", result.get('projectId'))
+
+        # Create a panel with success message and table
+        content = [
+            Text("✅ Successfully started data generation", style="bold green"),
+            "",  # Empty line
+            Text("Available commands:", style="yellow"),
+            Text(f"• Check status: solo finetune status {result.get('jobId')}", style="blue"),
+            Text(f"• Download data: solo finetune download {result.get('projectId')}", style="blue")
+        ]
+
+        panel = Panel(
+            "\n".join(str(item) for item in content),
+            title="[bold magenta]Generation Details[/]",
+            border_style="bright_blue"
+        )
+        console.print(panel)
+    except requests.exceptions.RequestException as e:
+        typer.echo(f"❌ Error: {str(e)}", err=True)
+
+def status(job_id: str):
+    """
+    Check the status of a data generation job.
+
+    Example:
+        solo finetune status "job-123-456"
+    """
+    api_key = get_starfish_api_key()
+    if not api_key:
+        typer.echo("❌ Starfish API key is required", err=True)
+        raise typer.Exit(1)
+
+    headers = {
+        'Content-Type': 'application/json',
+        'x-api-key': api_key
+    }
+
+    data = {
+        "jobId": job_id
+    }
+
+    try:
+        response = requests.post(
+            f'{BASE_URL}/jobStatus',
+            headers=headers,
+            data=json.dumps(data)
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        status = result.get('status', 'UNKNOWN')
+        typer.echo(f"📊 Data generation status: {status}")
+
+        if status == "COMPLETE":
+            typer.echo("✅ Data generation completed. You can now download the data")
+        elif status == "FAILED":
+            typer.echo(f"❌ Error: {result.get('error')}")
+    except requests.exceptions.RequestException as e:
+        typer.echo(f"❌ Error: {str(e)}", err=True)
+
+def download(
+    project_id: str,
+    output: Optional[str] = typer.Option("data.json", "--output", "-o", help="Output file path")
+):
+    """
+    Download generated data for a project.
+
+    Example:
+        solo finetune download "project-123-456" --output my_data.json
+    """
+    api_key = get_starfish_api_key()
+    if not api_key:
+        typer.echo("❌ Starfish API key is required", err=True)
+        raise typer.Exit(1)
+
+    headers = {
+        'Content-Type': 'application/json',
+        'x-api-key': api_key
+    }
+
+    data = {
+        "projectId": project_id
+    }
+
+    try:
+        response = requests.post(
+            f'{BASE_URL}/data',
+            headers=headers,
+            data=json.dumps(data)
+        )
+        response.raise_for_status()
+
+        result = response.json()
+
+        # Save the data to a file
+        with open(output, 'w') as f:
+            json.dump(result, f, indent=2)
+
+        typer.echo(f"✅ Successfully downloaded data to {output}")
+        typer.echo(f"📊 Number of records: {len(result['data'])}")
+    except requests.exceptions.RequestException as e:
+        typer.echo(f"❌ Error: {str(e)}", err=True)
+    except IOError as e:
+        typer.echo(f"❌ Error writing to file: {str(e)}", err=True)
+
+def run(
+    data_path: str = typer.Argument(..., help="Path to the JSON data file"),
+    output_dir: str = typer.Option("./finetuned_model", "--output-dir", "-o", help="Directory to save the finetuned model"),
+    batch_size: int = typer.Option(2, "--batch-size", "-b", help="Training batch size"),
+    epochs: int = typer.Option(1, "--epochs", "-e", help="Number of training epochs"),
+    learning_rate: float = typer.Option(2e-4, "--learning-rate", "-lr", help="Learning rate"),
+    lora_r: int = typer.Option(2, "--lora-r", help="LoRA attention dimension"),
+    lora_alpha: int = typer.Option(8, "--lora-alpha", help="LoRA alpha parameter"),
+    lora_dropout: float = typer.Option(0.05, "--lora-dropout", help="LoRA dropout value"),
+    rebuild_image: bool = typer.Option(False, "--rebuild-image", help="Force rebuild the Docker image"),
+):
+    """
+    Finetune a model on generated data using unsloth with LoRA in a Docker container.
+
+    Example:
+        solo finetune run data.json --output-dir ./my_model --batch-size 8
+    """
+    try:
+        # Convert paths to absolute paths
+        data_path = os.path.abspath(data_path)
+        output_dir = os.path.abspath(output_dir)
+
+        # Ensure output directory exists
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Check if container exists (running or stopped)
+        container_exists = subprocess.run(
+            ["docker", "ps", "-aq", "-f", "name=solo-finetune"],
+            capture_output=True,
+            text=True
+        ).stdout.strip()
+
+        if container_exists:
+            # Check if container is running
+            is_running = subprocess.run(
+                ["docker", "ps", "-q", "-f", "name=solo-finetune"],
+                capture_output=True,
+                text=True
+            ).stdout.strip()
+
+            if is_running:
+                typer.echo("✅ Finetune container is already running")
+            else:
+                subprocess.run(["docker", "start", "solo-finetune"], check=True)
+        else:
+            # Check if image exists
+            docker_finetune = "getsolo/finetune:latest"
+            image_exists = subprocess.run(
+                ["docker", "images", "-q", docker_finetune],
+                capture_output=True,
+                text=True
+            ).stdout.strip()
+
+            if not image_exists or rebuild_image:
+                typer.echo("📥 Pulling finetune image...")
+                try:
+                    subprocess.run(["docker", "pull", docker_finetune], check=True)
+                except subprocess.CalledProcessError as e:
+                    typer.echo(f"❌ Error: {str(e)}", err=True)
+                    raise typer.Exit(1)
+
+            # Prepare arguments for the training script
+            training_args = {
+                "data_path": "/app/data.json",
+                "output_dir": "/app/output",
+                "epochs": epochs,
+                "batch_size": batch_size,
+                "learning_rate": learning_rate,
+                "lora_r": lora_r,
+                "lora_alpha": lora_alpha,
+                "lora_dropout": lora_dropout,
+            }
+
+            # Convert arguments to command line format
+            args_list = []
+            for key, value in training_args.items():
+                args_list.extend([f"--{key.replace('_', '-')}", str(value)])
+
+            # Run the finetuning command in the container
+            docker_cmd = [
+                "docker", "run",
+                "--name", "solo-finetune",
+                "--gpus", "all",  # Enable GPU access
+                "-v", f"{data_path}:/app/data.json:ro",  # Mount data file
+                "-v", f"{output_dir}:/app/output",  # Mount output directory
+                docker_finetune,
+                "python", "./finetune_script.py",
+                *args_list
+            ]
+
+            typer.echo("🚀 Starting finetuning process...")
+            subprocess.run(docker_cmd, check=True)
+
+            typer.echo("✅ Finetuning completed successfully!")
+            typer.echo(f"📁 Model saved to: {output_dir}")
+            typer.echo(f"📁 GGUF model converted and saved to {os.path.join(output_dir, 'gguf')}")
+
+    except subprocess.CalledProcessError as e:
+        typer.echo(f"❌ Error during Docker operation: {str(e)}", err=True)
+        raise typer.Exit(1)
+    except Exception as e:
+        typer.echo(f"❌ Error: {str(e)}", err=True)
+        raise typer.Exit(1)
+
+
+
+
diff --git a/solo_server/data.json b/solo_server/data.json
new file mode 100644
index 0000000..559bfbe
--- /dev/null
+++ b/solo_server/data.json
@@ -0,0 +1,56 @@
+{
+  "data": [
+    {
+      "id": "ab50896d894d4154ba5c4c1d73e5d93a",
+      "data": "{\"question\":\"What is a popular food item at the Gilroy Garlic Festival?\",\"answer\":\"Garlic ice cream is a unique and popular food item served at the Gilroy Garlic Festival.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "942e67de3d9c437ea4793f9b8ade2ee4",
+      "data": "{\"question\":\"What major highways run through Gilroy?\",\"answer\":\"Major highways include U.S. Route 101 and California State Route 152.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "e66c4448d43142c6b3275090e7c04c8e",
+      "data": "{\"question\":\"What nearby cities are close to Gilroy?\",\"answer\":\"Nearby cities include Morgan Hill to the north and Hollister to the south.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "7a8f6e29d0e24b12872added76b68fc0",
+      "data": "{\"question\":\"Does Gilroy have any parks?\",\"answer\":\"Yes, Gilroy has several parks, including Christmas Hill Park and Gilroy Sports Park.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "94695387eb8f42ad9dea10cd498f7343",
+      "data": "{\"question\":\"When is the Gilroy Garlic Festival held?\",\"answer\":\"The Gilroy Garlic Festival is typically held in late July each year.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "5bc6d4f763c74fada8885cd5a3a0eb8f",
+      "data": "{\"question\":\"What is the climate like in Gilroy?\",\"answer\":\"Gilroy has a Mediterranean climate, characterized by hot, dry summers and mild, wet winters.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "a9a29fb13f8b4b9ea84ccfd3c0b89cba",
+      "data": "{\"question\":\"What kind of agriculture is prominent in Gilroy?\",\"answer\":\"Gilroy is known for its garlic production, as well as other crops like strawberries and wine grapes.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "d9890944793442338d445089408cbc0f",
+      "data": "{\"question\":\"What are popular attractions in Gilroy, California?\",\"answer\":\"Popular attractions in Gilroy include Gilroy Gardens Family Theme Park and the historic Gilroy Museum.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "6b59224a01244b76b8f9e736cdf8a103",
+      "data": "{\"question\":\"What is Gilroy known for?\",\"answer\":\"Gilroy is known as the 'Garlic Capital of the World' and hosts the annual Gilroy Garlic Festival.\"}",
+      "topic": "California cities"
+    },
+    {
+      "id": "cde50fba0577480bb5f223ffced04781",
+      "data": "{\"question\":\"What is the population of Gilroy, California?\",\"answer\":\"As of the 2020 census, the population of Gilroy is approximately 60,000.\"}",
+      "topic": "California cities"
+    }
+  ],
+  "nextToken": null,
+  "success": true
+}
diff --git a/solo_server/finetune_script.py b/solo_server/finetune_script.py
new file mode 100644
index 0000000..ed9cf9e
--- /dev/null
+++ b/solo_server/finetune_script.py
@@ -0,0 +1,196 @@
+import json
+from datasets import Dataset
+from unsloth import FastLanguageModel, is_bfloat16_supported, standardize_sharegpt
+from pathlib import Path
+import typer
+from peft import LoraConfig, TaskType
+from trl import SFTTrainer
+from transformers import TrainingArguments
+import torch
+
+def run_training(
+    data_path: str,
+    output_dir: str,
+    epochs: int,
+    batch_size: int,
+    learning_rate: float,
+    lora_r: int,
+    lora_alpha: int,
+    lora_dropout: float,
+):
+    """Run the finetuning process"""
+
+    # Check GPU compatibility
+    if torch.cuda.is_available():
+        gpu_name = torch.cuda.get_device_name()
+        compute_capability = torch.cuda.get_device_capability()
+        print(f"Found GPU: {gpu_name} with compute capability {compute_capability}")
+
+        # Use 8-bit quantization for older GPUs
+        use_4bit = compute_capability[0] >= 8  # Use 4-bit only for Ampere (8.0) and newer
+    else:
+        print("No GPU found, using CPU mode")
+        use_4bit = False
+
+    try:
+        print("Initializing model and tokenizer...")
+        # Initialize model with appropriate quantization
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
+            max_seq_length=2048,
+            dtype=None,
+            load_in_4bit=use_4bit,  # Use 4-bit quantization only for compatible GPUs
+            load_in_8bit=not use_4bit,  # Use 8-bit quantization for older GPUs
+        )
+        print("Model and tokenizer initialized successfully")
+
+    except Exception as e:
+        print(f"Error initializing model: {str(e)}")
+        raise
+
+    try:
+        print("Applying PEFT configuration...")
+        model = FastLanguageModel.get_peft_model(
+            model,
+            r=lora_r,
+            target_modules=[
+                "q_proj", "k_proj", "v_proj", "o_proj",
+                "gate_proj", "up_proj", "down_proj",
+            ],
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            use_gradient_checkpointing="unsloth",
+            use_rslora=False,
+            random_state=3407,
+        )
+        print("PEFT configuration applied successfully")
+
+    except Exception as e:
+        print(f"Error applying PEFT configuration: {str(e)}")
+        raise
+
+    with open(data_path) as f:
+        raw_data = json.load(f)
+
+    dataset = prepare_dataset(raw_data, tokenizer)
+
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=4,
+        learning_rate=learning_rate,
+        logging_steps=10,
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        warmup_ratio=0.03,
+        weight_decay=0.01,
+        optim="adamw_8bit",
+        lr_scheduler_type="linear",
+        seed=3407,
+        report_to="none",
+    )
+
+    # Initialize SFT trainer
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=2048,
+        dataset_num_proc=2,
+        args=training_args,
+        packing=False,
+    )
+
+    # Train
+    trainer.train()
+
+    # Save the model
+    print("Saving model...")
+    try:
+        merged_path = Path(output_dir) / "merged_model"
+        print("Merging and saving full model...")
+        model.save_pretrained_merged(
+            merged_path,
+            tokenizer,
+            save_method="merged_16bit",
+        )
+        print(f"✓ Saved merged model to {merged_path}")
+    except Exception as e:
+        print(f"Warning: Could not save merged model: {e}")
+        print("Continuing with GGUF conversion...")
+
+    # Save GGUF version
+    try:
+        gguf_path = Path(output_dir) / "gguf"
+        gguf_path.mkdir(exist_ok=True)
+        print("Converting model to GGUF format...")
+
+        # Use the adapter model for GGUF conversion
+        model.save_pretrained_gguf(
+            str(gguf_path / "model"),
+            tokenizer,
+            quantization_method="q4_k_m",
+        )
+    except Exception as e:
+        print(f"Warning: Could not save GGUF model: {e}")
+
+    print("Training and saving completed!")
+    print(tokenizer._ollama_modelfile)
+    print(tokenizer._ollama_modelfile.read())
+
+
+def format_instruction(question: str, answer: str) -> str:
+    """Format a single Q&A pair into instruction format"""
+    return f"""You are a helpful assistant. Based on the following question, provide a relevant answer:
+
+### Question:
+{question}
+
+### Response:
+{answer}"""
+
+def prepare_dataset(raw_data: dict, tokenizer):
+    """Prepare dataset from raw data"""
+    formatted_data = []
+
+    for item in raw_data["data"]:
+        data_dict = json.loads(item["data"])
+        formatted_text = format_instruction(
+            data_dict["question"],
+            data_dict["answer"]
+        )
+        formatted_data.append({"text": formatted_text + tokenizer.eos_token})
+    # Create dataset
+    dataset = Dataset.from_list(formatted_data)
+
+    return dataset
+
+if __name__ == "__main__":
+    app = typer.Typer()
+
+    @app.command()
+    def main(
+        data_path: str = typer.Option(..., "--data-path", help="Path to the JSON data file"),
+        output_dir: str = typer.Option(..., "--output-dir", help="Directory to save the model"),
+        epochs: int = typer.Option(..., "--epochs", help="Number of training epochs"),
+        batch_size: int = typer.Option(..., "--batch-size", help="Training batch size"),
+        learning_rate: float = typer.Option(..., "--learning-rate", help="Learning rate"),
+        lora_r: int = typer.Option(..., "--lora-r", help="LoRA attention dimension"),
+        lora_alpha: int = typer.Option(..., "--lora-alpha", help="LoRA alpha parameter"),
+        lora_dropout: float = typer.Option(..., "--lora-dropout", help="LoRA dropout value"),
+    ):
+        run_training(
+            data_path=data_path,
+            output_dir=output_dir,
+            epochs=epochs,
+            batch_size=batch_size,
+            learning_rate=learning_rate,
+            lora_r=lora_r,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+        )
+
+    app()
\ No newline at end of file
diff --git a/solo_server/main.py b/solo_server/main.py
index 28c457b..177bebc 100644
--- a/solo_server/main.py
+++ b/solo_server/main.py
@@ -6,7 +6,7 @@
 from enum import Enum
 from pathlib import Path
 from solo_server.utils.docker_utils import start_docker_engine
-from solo_server.utils.hardware import detect_hardware, display_hardware_info
+from solo_server.utils.hardware import detect_hardware, display_hardware_info, recommended_server
 from solo_server.utils.nvidia import check_nvidia_toolkit, install_nvidia_toolkit_linux, install_nvidia_toolkit_windows
 from solo_server.utils.server_utils import setup_vllm_server, setup_ollama_server, setup_llama_cpp_server
 
@@ -28,6 +28,8 @@ def setup():
     typer.echo("📊 Available Server Options:")
     for server in ServerType:
         typer.echo(f" • {server.value}")
+
+    recmd_server = recommended_server(memory_gb, gpu_vendor, gpu_memory)
 
     def server_type_prompt(value: str) -> ServerType:
         normalized_value = value.lower()
@@ -39,20 +41,17 @@
     server_choice = typer.prompt(
         "\nChoose server",
         type=server_type_prompt,
-        default="ollama",
+        default=recmd_server,
     )
     # GPU Configuration
     use_gpu = False
-    if gpu_vendor in ["NVIDIA", "AMD", "Intel", "Apple Silicon"]:
-        use_gpu = typer.confirm(
-            f"\n🎮 {gpu_vendor} GPU detected ({gpu_model}). Use GPU acceleration?",
-            default=True
-        )
+    if gpu_vendor in ["NVIDIA", "AMD", "Intel", "Apple Silicon"]:
+        use_gpu = True
 
     if use_gpu and gpu_vendor == "NVIDIA":
         if not check_nvidia_toolkit(os_name):
-            if typer.confirm("NVIDIA toolkit not found. Install now?", default=True):
+            if typer.confirm("NVIDIA GPU detected, but GPU drivers not found. Install now?", default=True):
                 if os_name == "Linux":
                     try:
                         install_nvidia_toolkit_linux()
                     except subprocess.CalledProcessError as e:
@@ -76,50 +76,35 @@ def server_type_prompt(value: str) -> ServerType:
         typer.echo("❌ Docker is not installed or not in the system PATH. Please install Docker first.\n", err=True)
         typer.secho("Install Here: https://docs.docker.com/get-docker/", fg=typer.colors.GREEN)
         raise typer.Exit(code=1)
-    else:
+
+
+    try:
+        subprocess.run(["docker", "info"], check=True, capture_output=True, timeout=20)
+    except subprocess.CalledProcessError:
+        typer.echo("Docker daemon is not running. Attempting to start Docker...", err=True)
+        if not start_docker_engine(os_name):
+            raise typer.Exit(code=1)
+        # Re-check if Docker is running
         try:
-            subprocess.run([docker_path, "info"], check=True, capture_output=True, timeout=30)
+            subprocess.run(["docker", "info"], check=True, capture_output=True, timeout=20)
         except subprocess.CalledProcessError:
-            typer.echo("Docker daemon is not running. Attempting to start Docker...", err=True)
-            if not start_docker_engine(os_name):
-                raise typer.Exit(code=1)
-            # Re-check if Docker is running
-            try:
-                subprocess.run([docker_path, "info"], check=True, capture_output=True)
-            except subprocess.CalledProcessError:
-                typer.echo("Try running the terminal with admin privileges.", err=True)
-                raise typer.Exit(code=1)
+            typer.echo("Try restarting the terminal with admin privileges and close any instances of podman.", err=True)
+            raise typer.Exit(code=1)
+
     # Server setup
     try:
         if server_choice == ServerType.VLLM:
             setup_success = setup_vllm_server(use_gpu, cpu_model, gpu_vendor)
             if setup_success:
-                def is_port_in_use(port: int) -> bool:
-                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                        return s.connect_ex(('localhost', port)) == 0
-
-                # Wait for the port to be in use
-                port = 8000
-                timeout = 60  # seconds
-                start_time = time.time()
-                while time.time() - start_time < timeout:
-                    if is_port_in_use(port):
-                        typer.secho(
-                            f"Access the API at: http://localhost:{port}",
-                            fg=typer.colors.BLUE
-                        )
-                        typer.secho(
-                            "If you experience issues, check docker logs with 'docker logs solo-vllm'\n",
+                typer.secho(
+                    "Access the API at: http://localhost:8000\n",
+                    fg=typer.colors.BLUE
+                )
+                typer.secho(
+                    "If you experience any issues, check docker logs with 'docker logs solo-vllm'\n",
                     fg=typer.colors.YELLOW
                 )
-                        break
-                    time.sleep(1)
-                else:
-                    typer.secho(
-                        f"Port {port} is not listening after {timeout} seconds. Please check docker logs for for more information.\n",
-                        fg=typer.colors.RED
-                    )
 
         elif server_choice == ServerType.OLLAMA:
             setup_success = setup_ollama_server(use_gpu, gpu_vendor)
@@ -155,8 +140,7 @@ def is_port_in_use(port: int) -> bool:
             typer.secho("\n✅ Custom API configuration saved!", fg=typer.colors.BRIGHT_GREEN)
 
     except Exception as e:
-        typer.echo(f"\n❌ Unexpected error: {e}", err=True)
-        typer.echo("Please check docker logs for more information.", err=True)
+        typer.echo(f"\n❌ Setup failed: {e}", err=True)
         raise typer.Exit(code=1)
 
 if __name__ == "__main__":
diff --git a/solo_server/utils/docker_utils.py b/solo_server/utils/docker_utils.py
index 51088ac..5a36b45 100644
--- a/solo_server/utils/docker_utils.py
+++ b/solo_server/utils/docker_utils.py
@@ -52,7 +52,7 @@ def start_docker_engine(os_name):
         subprocess.run(["open", "/Applications/Docker.app"], check=True, capture_output=True)
 
     # Wait for Docker to start
-    timeout = 60
+    timeout = 30
     start_time = time.time()
     while time.time() - start_time < timeout:
         try:
diff --git a/solo_server/utils/hardware.py b/solo_server/utils/hardware.py
index 516780b..5c3db76 100644
--- a/solo_server/utils/hardware.py
+++ b/solo_server/utils/hardware.py
@@ -1,6 +1,7 @@
 import platform
 import psutil
 import GPUtil
+import typer
 import subprocess
 import os
 import json
@@ -64,6 +65,26 @@ def detect_hardware() -> Tuple[str, int, float, str, str, float, str, str]:
 
     return cpu_model, cpu_cores, memory_gb, gpu_vendor, gpu_model, gpu_memory, compute_backend, os_name
 
+def recommended_server(memory_gb, gpu_vendor, gpu_memory) -> str:
+    """
+    Determines the recommended server based on hardware specifications.
+    Returns the recommended server type after displaying the recommendation.
+    """
+    # vLLM recommendation criteria
+    if (gpu_vendor in ["NVIDIA", "AMD", "Intel"] and gpu_memory >= 8) and (memory_gb >= 16):
+        typer.echo("\n✨ vLLM is recommended for your system")
+        return "vLLM"
+
+    # Ollama recommendation criteria
+    elif (gpu_vendor in ["NVIDIA", "AMD"] and gpu_memory >= 6) or (memory_gb >= 16):
+        typer.echo("\n✨ Ollama is recommended for your system")
+        return "ollama"
+
+    # Llama.cpp recommendation criteria
+    else:
+        typer.echo("\n✨ Llama.cpp is recommended for your system")
+        return "llama.cpp"
+
 def display_hardware_info(typer):
 
     # Check if system info exists in config file
@@ -127,3 +148,15 @@ def display_hardware_info(typer):
         title="[bold cyan]System Information[/]"
     )
     console.print(panel)
+
+    # After displaying the hardware panel, show the recommendation
+    recommended_server, reasoning = get_recommended_server()
+    typer.secho(
+        "\n💡 Recommended Server:",
+        fg=typer.colors.BRIGHT_CYAN,
+        bold=True
+    )
+    typer.secho(
+        f"► {recommended_server}: {reasoning}",
+        fg=typer.colors.BRIGHT_GREEN
+    )
diff --git a/solo_server/utils/hf_utils.py b/solo_server/utils/hf_utils.py
index 530582a..ebd7d82 100644
--- a/solo_server/utils/hf_utils.py
+++ b/solo_server/utils/hf_utils.py
@@ -20,4 +20,3 @@ def get_available_models(repo_id: str, suffix: list[str] | str = ".gguf") -> lis
     except Exception as e:
         typer.echo(f"Error fetching models from {repo_id}: {e}")
         return []
-
diff --git a/solo_server/utils/server_utils.py b/solo_server/utils/server_utils.py
index b5bfad1..247fe69 100644
--- a/solo_server/utils/server_utils.py
+++ b/solo_server/utils/server_utils.py
@@ -6,12 +6,10 @@ import subprocess
 
 from solo_server.config import CONFIG_PATH
 from solo_server.utils.nvidia import is_cuda_toolkit_installed
-from solo_server.utils.llama_cpp_utils import start_llama_cpp_server, is_uv_available
+from solo_server.utils.llama_cpp_utils import is_uv_available, start_llama_cpp_server
 
 def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None, os_name:str = None, port: int = 8000):
     """Setup vLLM server with Docker"""
-    typer.echo("\n🔧 Setting up vLLM server...")
-
     # Initialize container_exists flag
     container_exists = False
     try:
@@ -30,7 +28,7 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
             typer.echo("✅ vLLM server is already setup!")
             return True
         else:
-            remove_container = typer.confirm("vLLM server already exists. Do you want to run with a new model?", default=True)
+            remove_container = typer.confirm("vLLM server already exists. Do you want to run with a new model?", default=False)
             if remove_container:
                 subprocess.run(["docker", "rm", "solo-vllm"], check=True, capture_output=True)
             else:
@@ -47,6 +45,11 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
             subprocess.run(["docker", "pull", "rocm/vllm"], check=True)
         elif cpu == "Apple":
             subprocess.run(["docker", "pull", "getsolo/vllm-arm"], check=True)
+        elif cpu in ["Intel", "AMD"]:
+            subprocess.run(["docker", "pull", "getsolo/vllm-cpu"], check=True)
+        else:
+            typer.echo("❌ vLLM currently does not support your machine", err=True)
+            return False
 
     # Check if port is available
     try:
@@ -68,8 +71,8 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
         with open(CONFIG_PATH, 'r') as f:
             config = json.load(f)
         hf_token = config.get('hugging_face', {}).get('token', '')
-
-    if not hf_token:  # If not in config file, prompt user
+
+    if not hf_token:
         if os_name in ["Linux", "Windows"]:
             typer.echo("Use Ctrl + Shift + V to paste your token.")
         hf_token = typer.prompt("Please add your HuggingFace token (Recommended)")
@@ -121,6 +124,8 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
 
         elif cpu == "Apple":
             docker_run_cmd.append("getsolo/vllm-arm")
+        elif cpu in ["Intel", "AMD"]:
+            docker_run_cmd.append("getsolo/vllm-cpu")
         else:
             typer.echo("❌ Solo server vLLM currently do not support your machine", err=True)
             return False
@@ -128,7 +133,7 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
     # Ask user for model name
     default_model = "meta-llama/Llama-3.2-1B-Instruct"
     model_name = typer.prompt(f"Enter the model name", default=default_model)
-    
+
     # Add the model argument and additional parameters
     docker_run_cmd.append("--model")
     docker_run_cmd.append(model_name)
@@ -141,6 +146,20 @@ def setup_vllm_server(gpu_enabled: bool, cpu: str = None, gpu_vendor: str = None
 
         typer.echo("🚀 Starting vLLM server...")
         subprocess.run(docker_run_cmd, check=True, capture_output=True)
+        # Check docker logs for any errors
+        try:
+            logs = subprocess.run(
+                ["docker", "logs", "solo-vllm"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            if logs.stderr:
+                typer.echo(f"⚠️ Server logs show errors:\n{logs.stderr}", err=True)
+            if logs.stdout:
+                typer.echo(f"Server logs:\n{logs.stdout}")
+        except subprocess.CalledProcessError as e:
+            typer.echo(f"❌ Failed to fetch docker logs: {e}", err=True)
 
         # Wait for container to be ready with timeout
         timeout = 30
@@ -197,7 +216,6 @@ def setup_ollama_server(gpu_enabled: bool = False, gpu_vendor: str = None, port:
         else:
             subprocess.run(["docker", "start", "solo-ollama"], check=True, capture_output=True)
     else:
-        typer.echo("\n🔧 Setting up Ollama server...")
         # Pull Ollama image
         typer.echo("📥 Pulling Ollama Registry...")
         subprocess.run(["docker", "pull", "ollama/ollama"], check=True)
@@ -268,7 +286,13 @@ def setup_llama_cpp_server(gpu_enabled: bool, gpu_vendor: str = None, os_name: s
         gpu_vendor (str, optional): The GPU vendor (e.g., NVIDIA, AMD, Apple Silicon).
         os_name (str, optional): The name of the operating system.
     """
-    typer.echo("\n🔧 Setting up llama_cpp server...")
+    # Check if llama-cpp-python is already installed
+    try:
+        import llama_cpp
+        typer.echo("✅ llama.cpp server is already installed")
+        return start_llama_cpp_server(os_name)
+    except ImportError:
+        typer.echo("Installing llama.cpp server...")
 
     # Check if llama-cpp-python is already installed
     try:
@@ -280,7 +304,6 @@ def setup_llama_cpp_server(gpu_enabled: bool, gpu_vendor: str = None, os_name: s
 
     # Set CMAKE_ARGS based on hardware and OS
     cmake_args = []
-
    if gpu_enabled:
         if gpu_vendor == "NVIDIA":
             if not is_cuda_toolkit_installed():
@@ -292,7 +315,7 @@ def setup_llama_cpp_server(gpu_enabled: bool, gpu_vendor: str = None, os_name: s
             cmake_args.append("-DGGML_HIPBLAS=on")
         elif gpu_vendor == "Apple Silicon":
             cmake_args.append("-DGGML_METAL=on")
-    
+
     cmake_args_str = " ".join(cmake_args)
 
     try:
@@ -309,16 +332,15 @@ def setup_llama_cpp_server(gpu_enabled: bool, gpu_vendor: str = None, os_name: s
 
             installer_cmd = [sys.executable, "-m", "pip", "install", "--no-cache-dir", "llama-cpp-python[server]"]
 
         subprocess.check_call(installer_cmd, env=env)
-
         try:
             if start_llama_cpp_server(os_name):
-                typer.echo("\n ✅ llama-cpp server is ready!")
+                typer.echo("\n ✅ llama.cpp server is ready!")
+                return True
         except Exception as e:
-            typer.echo(f"❌ Failed to start llama_cpp server: {e}", err=True)
+            typer.echo(f"❌ Failed to start llama.cpp server: {e}", err=True)
             return False
-        return True
 
     except subprocess.CalledProcessError as e:
-        typer.echo(f"❌ Failed to setup llama_cpp_python server: {e}", err=True)
+        typer.echo(f"❌ Failed to setup llama.cpp server: {e}", err=True)
         return False