diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile
new file mode 100644
index 0000000..ac108d8
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile
@@ -0,0 +1,40 @@
+#---
+# name: transformers
+# config: config.py
+# group: llm
+# depends: [pytorch, torchvision, huggingface_hub, rust]
+# test: [test_version.py, huggingface-benchmark.py]
+# docs: docs.md
+# notes: for quantization support in Transformers, use the bitsandbytes, AutoGPTQ, or AutoAWQ containers.
+#---
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ARG TRANSFORMERS_PACKAGE=transformers \
+    TRANSFORMERS_VERSION
+
+# if you want optimum[exporters,onnxruntime], see the optimum package
+
+RUN pip3 install --no-cache-dir --verbose accelerate && \
+    pip3 install --no-cache-dir --verbose sentencepiece && \
+    pip3 install --no-cache-dir --verbose optimum && \
+    \
+    # install from pypi, git, etc. (sometimes another version got installed)
+    pip3 uninstall -y transformers && \
+    \
+    echo "Installing transformers $TRANSFORMERS_VERSION (from $TRANSFORMERS_PACKAGE)" && \
+    pip3 install --no-cache-dir --verbose ${TRANSFORMERS_PACKAGE} && \
+    \
+    # "/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py", line 118
+    # AttributeError: module 'torch.distributed' has no attribute 'is_initialized'
+    PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \
+    sed -i \
+        -e 's|torch.distributed.is_initialized|torch.distributed.is_available|g' \
+        ${PYTHON_ROOT}/transformers/modeling_utils.py
+
+# add benchmark script
+COPY huggingface-benchmark.py /usr/local/bin
+
+# make sure it loads
+RUN pip3 show transformers \
+    && python3 -c 'import transformers; print(transformers.__version__)'
\ No newline at end of file
diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile.vllm b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile.vllm
new file mode 100644
index 0000000..e38f03c
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/Dockerfile.vllm
@@ -0,0 +1,22 @@
+#---
+# name: vllm
+# group: vlm
+# config: config.py
+# depends: [pytorch, torchvision, torchaudio, transformers, triton, xformers]
+# requires: '>=34.1.0'
+# test: test.py
+# notes: https://github.com/vllm-project/vllm
+#---
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ARG VLLM_VERSION \
+    XGRAMMAR_VERSION \
+    FORCE_BUILD=off
+
+RUN apt-get update -y && apt-get install -y libnuma-dev \
+    libsndfile1 libsndfile1-dev libprotobuf-dev libsm6 libxext6 libgl1
+
+COPY build.sh install.sh patches /tmp/vllm/
+
+RUN /tmp/vllm/install.sh || /tmp/vllm/build.sh
\ No newline at end of file
diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/batch.py b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/batch.py
new file mode 100644
index 0000000..f13a867
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/batch.py
@@ -0,0 +1,40 @@
+import litserve as ls
+from transformers import pipeline
+
+class HuggingFaceLitAPI(ls.LitAPI):
+    def setup(self, device):
+        # Load the model and tokenizer from the Hugging Face Hub
+        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
+        # You can replace the model name with any model from the Hugging Face Hub
+        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+        self.pipeline = pipeline("text-classification", model=model_name, device=device)
+
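+    # With batching enabled, LitServe calls: decode_request -> batch -> predict -> unbatch -> encode_response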
+    def decode_request(self, request):
+        # Extract text from the request
+        # This assumes the request payload is of the form: {'text': 'Your input text here'}
+        return request["text"]
+
+    def batch(self, inputs):
+        # Return the batched input list
+        return inputs
+
+    def predict(self, texts):
+        # Use the loaded pipeline to perform inference
+        return self.pipeline(texts)
+
+    def unbatch(self, outputs):
+        # Unbatch the model output
+        return outputs
+
+    def encode_response(self, output):
+        # Format the output from the model to send as a response
+        # This example sends back the label and score of the prediction
+        return {"label": output["label"], "score": output["score"]}
+
+if __name__ == "__main__":
+    # Create an instance of your API
+    api = HuggingFaceLitAPI()
+    # Start the server, specifying the port
+    server = ls.LitServer(api, accelerator="cuda", max_batch_size=16, workers_per_device=4, batch_timeout=0.01)
+    server.run(port=8000)
\ No newline at end of file
diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/client.py b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/client.py
new file mode 100644
index 0000000..2036e28
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/client.py
@@ -0,0 +1,15 @@
+import requests
+
+def test_server(text):
+    # API endpoint URL
+    url = "http://127.0.0.1:8000/predict"
+    # Request payload
+    payload = {"text": text}
+    # POST request to the server
+    response = requests.post(url, json=payload)
+    # Print the response from the server
+    print(response.json())
+
+if __name__ == "__main__":
+    sample_text = "I love machine learning. My experience with LitServe has been amazing!"
+    test_server(sample_text)
\ No newline at end of file
diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/requirements.txt b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/requirements.txt
new file mode 100644
index 0000000..5473e9e
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/requirements.txt
@@ -0,0 +1,15 @@
+accelerate
+bitsandbytes
+decord
+litserve
+openai
+Pillow
+qwen-vl-utils
+streamlit
+torch==2.4.0
+torchvision==0.19.0
+git+https://github.com/huggingface/transformers.git
+
+# Optional dependency
+# Uncomment the following line if you need flash-attn
+# flash-attn==2.6.1
\ No newline at end of file
diff --git a/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/server.py b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/server.py
new file mode 100644
index 0000000..f6f77ba
--- /dev/null
+++ b/solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/server.py
@@ -0,0 +1,31 @@
+import litserve as ls
+from transformers import pipeline
+
+class HuggingFaceLitAPI(ls.LitAPI):
+    def setup(self, device):
+        # Load the model and tokenizer from the Hugging Face Hub
+        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
+        # You can replace the model name with any model from the Hugging Face Hub
+        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+        self.pipeline = pipeline("text-classification", model=model_name, device=device)
+
+    def decode_request(self, request):
+        # Extract text from the request
+        # This assumes the request payload is of the form: {'text': 'Your input text here'}
+        return request["text"]
+
+    def predict(self, text):
+        # Use the loaded pipeline to perform inference
+        return self.pipeline(text)
+
+    def encode_response(self, output):
+        # Format the output from the model to send as a response
+        # This example sends back the label and score of the prediction
+        return {"label": output[0]["label"], "score": output[0]["score"]}
+
+if __name__ == "__main__":
+    # Create an instance of your API
+    api = HuggingFaceLitAPI()
+    # Start the server, specifying the port
+    server = ls.LitServer(api, accelerator="cuda")
+    server.run(port=8000)
\ No newline at end of file
diff --git a/solo_server/examples/merged/qwen2.5/server.py b/solo_server/examples/merged/qwen2.5/server.py
new file mode 100644
index 0000000..2ec0d91
--- /dev/null
+++ b/solo_server/examples/merged/qwen2.5/server.py
@@ -0,0 +1,35 @@
+from fastapi import HTTPException
+from stock_researcher import research_financials, research_news, stock_researcher, load_model
+import litserve as ls
+
+model_id = "Qwen/Qwen2.5-7B-Instruct"
+
+class StockAnalyst(ls.LitAPI):
+    def setup(self, device):
+        # Using a self-hosted open-source model with an OpenAI-compatible API interface
+        self.model = model_id
+
+    def decode_request(self, request: dict):
+        # Query containing the stock name to research
+        return request["query"]
+
+    def predict(self, query: str):
+        try:
+            # 1. Find financial info
+            messages, financials = research_financials(query, self.model)
+            # 2. Research news about the stocks
+            tool_calls, tool_final_result = research_news(financials, query, self.model)
+            # 3. Analyze the data
+            yield from stock_researcher(tool_final_result, tool_calls, messages, self.model)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail="Stock analyst ran into an error")
+
+    def encode_response(self, response):
+        for chunk in response:
+            yield chunk
+
+if __name__ == "__main__":
+    load_model(model_id)
+    api = StockAnalyst()
+    server = ls.LitServer(api, workers_per_device=8, accelerator="cpu", timeout=False, stream=True)
+    server.run(port=8888)
\ No newline at end of file
diff --git a/solo_server/examples/solo.yaml b/solo_server/examples/solo.yaml
new file mode 100644
index 0000000..d460fbf
--- /dev/null
+++ b/solo_server/examples/solo.yaml
@@ -0,0 +1,23 @@
+domain: "education"
+hardware:
+  accelerator: "cpu"
+  workers_per_device: 8
+  timeout: false
+  stream: true
+default-llm: "gpt-4"
+models:
+  interest_tags:
+    - fast
+    - balanced
+    - innovative
+  prompt_seed: "Model prompt seed based on interests: fast, balanced, innovative"
+docker:
+  image: "your_docker_image"
+  port: 5070
+huggingface:
+  cache_dir: "huggingface"
+  token: "YOUR_HUGGINGFACE_API_TOKEN"
+soloconfig:
+  path: "soloconfig.py"
+  modelpath: "models"
+  storage_limit_gb: 20