diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..4c42a02
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,99 @@
+# Dockerfile now located in the project root directory
+
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+
+# PyTorch variant selector. Recognised values: cuda12, cuda128, cuda11, rocm,
+# xpu, cpu. Any other non-empty value is handed to pip verbatim as a
+# requirement specifier. Empty (default) installs requirements.txt unchanged.
+ARG TORCH_VERSION=""
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install system dependencies for eSpeak and other requirements (removed git)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        espeak-ng \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy application code from the repository root (build context) to /app
+# This command now copies the root requirements.txt directly into /app
+COPY . .
+
+# Install Rust compiler if needed for better cross-compiling support
+#RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+#ENV PATH="/root/.cargo/bin:${PATH}"
+
+# --- PyTorch Installation Logic (Relies on the root requirements.txt copied to /app) ---
+#
+# Everything below runs in ONE RUN instruction on purpose: every RUN starts a
+# fresh shell, so variables assigned in one RUN are gone in the next. The pinned
+# versions read from requirements.txt must therefore be extracted in the same
+# shell that consumes them.
+RUN set -e; \
+    # Print the version pinned with "==" for package $1 (empty if not pinned).
+    pinned_version() { sed -n "s/^$1==//p" requirements.txt | head -n 1; }; \
+    TORCH_VERSION_REQ="$(pinned_version torch)"; \
+    TORCHAUDIO_VERSION_REQ="$(pinned_version torchaudio)"; \
+    TORCHVISION_VERSION_REQ="$(pinned_version torchvision)"; \
+    TORCHMETRICS_VERSION_REQ="$(pinned_version torchmetrics)"; \
+    echo "Found in requirements: torch==$TORCH_VERSION_REQ, torchaudio==$TORCHAUDIO_VERSION_REQ, torchvision==$TORCHVISION_VERSION_REQ, torchmetrics==$TORCHMETRICS_VERSION_REQ"; \
+    if [ -z "${TORCH_VERSION:-}" ]; then \
+        # Install all requirements as specified if no specific TORCH_VERSION is provided
+        echo "TORCH_VERSION not specified, installing all dependencies from requirements.txt..."; \
+        pip install --no-cache-dir --upgrade -r requirements.txt; \
+    else \
+        # Use the pinned versions only when all four packages are pinned; otherwise take the latest of each
+        if [ -n "$TORCH_VERSION_REQ" ] && [ -n "$TORCHVISION_VERSION_REQ" ] && [ -n "$TORCHAUDIO_VERSION_REQ" ] && [ -n "$TORCHMETRICS_VERSION_REQ" ]; then \
+            echo "Using specific versions from requirements.txt"; \
+            TORCH_SPECS="torch==$TORCH_VERSION_REQ torchvision==$TORCHVISION_VERSION_REQ torchaudio==$TORCHAUDIO_VERSION_REQ torchmetrics==$TORCHMETRICS_VERSION_REQ"; \
+        else \
+            echo "Using latest versions for the selected variant"; \
+            TORCH_SPECS="torch torchvision torchaudio torchmetrics"; \
+        fi; \
+        # TORCH_SPECS is left unquoted below so it splits into one argument per package
+        case "$TORCH_VERSION" in \
+            cuda12) \
+                pip install --no-cache-dir $TORCH_SPECS --extra-index-url https://download.pytorch.org/whl/cu121 ;; \
+            cuda128) \
+                pip install --no-cache-dir $TORCH_SPECS --extra-index-url https://download.pytorch.org/whl/nightly/cu128 ;; \
+            cuda11) \
+                pip install --no-cache-dir $TORCH_SPECS --extra-index-url https://download.pytorch.org/whl/cu118 ;; \
+            rocm) \
+                pip install --no-cache-dir $TORCH_SPECS --extra-index-url https://download.pytorch.org/whl/rocm6.2 ;; \
+            xpu) \
+                pip install --no-cache-dir $TORCH_SPECS; \
+                pip install --no-cache-dir intel-extension-for-pytorch --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ;; \
+            cpu) \
+                pip install --no-cache-dir $TORCH_SPECS --extra-index-url https://download.pytorch.org/whl/cpu ;; \
+            *) \
+                pip install --no-cache-dir $TORCH_VERSION ;; \
+        esac; \
+        # Install remaining requirements, skipping lines for all forced torch packages.
+        # grep exits 1 when nothing is left after filtering; that is not an error here.
+        echo "Installing remaining dependencies from requirements.txt..."; \
+        grep -v -E "^torch==|^torchvision==|^torchaudio==|^torchmetrics==" requirements.txt > requirements_no_torch.txt || true; \
+        if [ -s requirements_no_torch.txt ]; then \
+            pip install --no-cache-dir --upgrade -r requirements_no_torch.txt; \
+        else \
+            echo "No remaining dependencies to install."; \
+        fi; \
+        rm -f requirements_no_torch.txt; \
+    fi
+
+# --- End PyTorch Installation Logic ---
+
+# Set environment variables for eSpeak (if needed)
+# NOTE: the library path below is the x86_64 multiarch directory; images for other architectures use a different directory.
+ENV PHONEMIZER_ESPEAK_LIBRARY=/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1
+ENV PHONEMIZER_ESPEAK_PATH=/usr/bin
+
+# Expose any necessary ports (if applicable)
+EXPOSE 8000
+
+# Create a volume for input/output files
+VOLUME ["/app/input", "/app/output"]
+
+# Set the default command to run when starting the container
+# You might want to modify this based on specific inference scripts
+CMD ["bash"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..2c64815
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,30 @@
+version: '3.8'
+services:
+  diffrhythm:
+    build:
+      # The compose file sits in the project root next to the Dockerfile, so the
+      # build context (where source files are) is this directory, not its parent.
+      context: .
+      # Dockerfile path is relative to the context ('.').
+      # Since the Dockerfile is directly in the root, the path is just 'Dockerfile'.
+ dockerfile: Dockerfile + # args: + # TORCH_VERSION: cuda12 # TORCH_VERSION Options = cuda12, cuda128, cuda11, rocm, xpu, cpu + image: diffrhythm + container_name: diffrhythm +# volumes: +# - ./output:/app/output # Example: Mount output relative to docker-compose file location +# - ../infer:/app/infer # Example: Mount infer relative to project root + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + stdin_open: true + tty: true + ports: + - 8000:8000 + # Keep container running + command: ["/bin/bash", "-c", "tail -f /dev/null"] diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 7351db4..0000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -# Use an official Python runtime as a parent image -FROM python:3.10-slim - -# Set the working directory in the container -WORKDIR /app - -# Install system dependencies for eSpeak and other requirements -RUN apt-get update && apt-get install -y \ - git \ - espeak-ng \ - && rm -rf /var/lib/apt/lists/* - -# Clone the DiffRhythm repository -RUN git clone https://github.com/ASLP-lab/DiffRhythm.git . - -# Copy the requirements file into the container -COPY requirements.txt . 
- -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Set environment variables for eSpeak (if needed) -ENV PHONEMIZER_ESPEAK_LIBRARY=/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1 -ENV PHONEMIZER_ESPEAK_PATH=/usr/bin - -# Expose any necessary ports (if applicable) -# EXPOSE 8000 - -# Create a volume for input/output files -VOLUME ["/app/input", "/app/output"] - -# Set the default command to run when starting the container -# You might want to modify this based on specific inference scripts -CMD ["bash"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index 089036a..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,21 +0,0 @@ -version: '3.8' -services: - diffrhythm: - build: - context: . - dockerfile: Dockerfile - image: diffrhythm - container_name: diffrhythm -# volumes: -# - ./scripts:/app/scripts -# - ./output:/app/output - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - stdin_open: true - tty: true - command: ["/bin/bash", "-c", "tail -f /dev/null"] diff --git a/docker/requirements.txt b/docker/requirements.txt deleted file mode 100644 index a974756..0000000 --- a/docker/requirements.txt +++ /dev/null @@ -1,157 +0,0 @@ -accelerate==1.4.0 -aiofiles==23.2.1 -aiohappyeyeballs==2.6.1 -aiohttp==3.11.14 -aiosignal==1.3.2 -annotated-types==0.7.0 -anyio==4.9.0 -attrs==25.3.0 -audioread==3.0.1 -babel==2.17.0 -beartype==0.20.2 -bitsandbytes==0.45.3 -certifi==2025.1.31 -cffi==1.17.1 -charset-normalizer==3.4.1 -click==8.1.8 -cn2an==0.5.23 -colorama==0.4.6 -coloredlogs==15.0.1 -configparser==7.2.0 -csvw==3.5.1 -decorator==5.2.1 -Deprecated==1.2.18 -dlinfo==2.0.0 -docker-pycreds==0.4.0 -easydict==1.13 -einops==0.8.1 -einx==0.3.0 -ema-pytorch==0.7.7 -fastapi==0.115.12 -ffmpy==0.5.0 -filelock==3.18.0 -flatbuffers==25.2.10 -frozendict==2.4.6 -frozenlist==1.5.0 -fsspec==2025.3.0 -ftfy==6.3.1 -gin-config==0.5.0 -gitdb==4.0.12 
-GitPython==3.1.44 -gradio==5.22.0 -gradio_client==1.8.0 -groovy==0.1.2 -h11==0.14.0 -httpcore==1.0.7 -httpx==0.28.1 -huggingface-hub==0.29.3 -humanfriendly==10.0 -idna==3.10 -inflect==7.5.0 -isodate==0.7.2 -jaconv==0.4.0 -jieba==0.42.1 -Jinja2==3.1.6 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2024.10.1 -language-tags==1.2.0 -lazy_loader==0.4 -librosa==0.10.2.post1 -lightning-utilities==0.14.2 -llvmlite==0.44.0 -loguru==0.7.3 -markdown-it-py==3.0.0 -MarkupSafe==3.0.2 -mdurl==0.1.2 -more-itertools==10.6.0 -mpmath==1.3.0 -msgpack==1.1.0 -multidict==6.2.0 -muq==0.1.0 -mutagen==1.47.0 -networkx==3.4.2 -nnAudio==0.3.3 -numba==0.61.0 -numpy==2.1.3 -onnxruntime-gpu==1.21.0 -orjson==3.10.16 -packaging==24.2 -pandas==2.2.3 -phonemizer==3.3.0 -pillow==11.1.0 -platformdirs==4.3.7 -pooch==1.8.2 -prefigure==0.0.10 -proces==0.1.7 -propcache==0.3.0 -protobuf==5.29.4 -psutil==7.0.0 -py3langid==0.3.0 -pyarrow==19.0.1 -pycparser==2.22 -pydantic==2.10.6 -pydantic_core==2.27.2 -pydub==0.25.1 -Pygments==2.19.1 -pykakasi==2.3.0 -pylance==0.23.2 -pyparsing==3.2.2 -pypinyin==0.53.0 -pyreadline3==3.5.4 -python-dateutil==2.9.0.post0 -python-multipart==0.0.20 -pytorch-lightning==2.5.1 -pytz==2025.1 -PyYAML==6.0.2 -rdflib==7.1.3 -referencing==0.36.2 -regex==2024.11.6 -requests==2.32.3 -rfc3986==1.5.0 -rich==13.9.4 -rpds-py==0.23.1 -ruff==0.11.2 -safehttpx==0.1.6 -safetensors==0.5.3 -scikit-learn==1.6.1 -scipy==1.15.2 -segments==2.3.0 -semantic-version==2.10.0 -sentry-sdk==2.24.1 -setproctitle==1.3.5 -setuptools==78.0.1 -shellingham==1.5.4 -six==1.17.0 -smmap==5.0.2 -sniffio==1.3.1 -soundfile==0.13.1 -soxr==0.5.0.post1 -starlette==0.46.1 -sympy==1.13.1 -threadpoolctl==3.6.0 -tokenizers==0.21.1 -tomlkit==0.13.2 -torch==2.6.0 -torchaudio==2.6.0 -torchdiffeq==0.2.5 -torchmetrics==1.7.0 -torchvision==0.21.0 -tqdm==4.67.1 -transformers==4.49.0 -typeguard==4.4.2 -typer==0.15.2 -typing_extensions==4.12.2 -tzdata==2025.2 -Unidecode==1.3.8 -uritemplate==4.1.1 -urllib3==2.3.0 
-uvicorn==0.34.0 -wandb==0.19.8 -wcwidth==0.2.13 -websockets==15.0.1 -win32_setctime==1.2.0 -wrapt==1.17.2 -x-clip==0.14.4 -x-transformers==2.1.2 -yarl==1.18.3 diff --git a/gradio/app.py b/gradio/app.py new file mode 100644 index 0000000..b6aece2 --- /dev/null +++ b/gradio/app.py @@ -0,0 +1,458 @@ +import gradio as gr +from openai import OpenAI +import requests +import json +# from volcenginesdkarkruntime import Ark +import torch +import torchaudio +from einops import rearrange +import argparse +import json +import os +import spaces +from tqdm import tqdm +import random +import numpy as np +import sys +import base64 + + +# Only for the infer module import +current_dir = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.dirname(current_dir) # Go up one level +if project_root not in sys.path: + sys.path.insert(0, project_root) + +# Now your import should work +from infer.infer_utils import ( + get_reference_latent, + get_lrc_token, + get_audio_style_prompt, + get_text_style_prompt, + prepare_model, + get_negative_style_prompt +) + +from infer.infer import inference + +MAX_SEED = np.iinfo(np.int32).max +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +print(device) + + +# --- CORRECTED MODEL LOADING --- +MAX_SEED = np.iinfo(np.int32).max +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +print(f"Using device: {device}") + +# Define the max_frames for each model duration +max_frames_short = 2048 # Corresponds to 95s +max_frames_long = 6144 # Corresponds to 285s + +# Load the standard model (95s) and common components +print("Loading base model (95s) and components...") +cfm, tokenizer, muq, vae = prepare_model(max_frames=max_frames_short, device=device) +print("Compiling base model...") +cfm = torch.compile(cfm) + +# Load only the full CFM model (285s) - reuse other components +print("Loading full model (285s)...") +# We only need the first return value (the cfm model) from this call +cfm_full, _, _, _ = 
prepare_model(max_frames=max_frames_long, device=device) +print("Compiling full model...") +cfm_full = torch.compile(cfm_full) + +print("Models loaded and compiled.") +# --- END OF CORRECTED LOADING --- + + +#cfm, cfm_full, tokenizer, muq, vae = prepare_model(device) +#cfm = torch.compile(cfm) +#cfm_full = torch.compile(cfm_full) + + +@spaces.GPU(duration=40) +def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav', odeint_method='euler', Music_Duration='95s', device='cuda'): + if Music_Duration == '95s': + max_frames = 2048 + cfm_model = cfm + else: + max_frames = 6144 + cfm_model = cfm_full + if randomize_seed: + seed = random.randint(0, MAX_SEED) + torch.manual_seed(seed) + sway_sampling_coef = -1 if steps < 32 else None + vocal_flag = False + try: + lrc_prompt, start_time = get_lrc_token(max_frames, lrc, tokenizer, device) + if current_prompt_type == 'audio': + style_prompt, vocal_flag = get_audio_style_prompt(muq, ref_audio_path) + else: + style_prompt = get_text_style_prompt(muq, text_prompt) + except Exception as e: + raise gr.Error(f"Error: {str(e)}") + negative_style_prompt = get_negative_style_prompt(device) + latent_prompt = get_reference_latent(device, max_frames) + generated_song = inference(cfm_model=cfm_model, + vae_model=vae, + cond=latent_prompt, + text=lrc_prompt, + duration=max_frames, + style_prompt=style_prompt, + negative_style_prompt=negative_style_prompt, + steps=steps, + cfg_strength=cfg_strength, + sway_sampling_coef=sway_sampling_coef, + start_time=start_time, + file_type=file_type, + vocal_flag=vocal_flag, + odeint_method=odeint_method, + ) + return generated_song + +def R1_infer1(theme, tags_gen, language): + try: + client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3") + + llm_prompt = """ + 请围绕"{theme}"主题生成一首符合"{tags}"风格的语言为{language}的完整歌词。严格遵循以下要求: + + ### **强制格式规则** + 1. 
**仅输出时间戳和歌词**,禁止任何括号、旁白、段落标记(如副歌、间奏、尾奏等注释)。 + 2. 每行格式必须为 `[mm:ss.xx]歌词内容`,时间戳与歌词间无空格,歌词内容需完整连贯。 + 3. 时间戳需自然分布,**第一句歌词起始时间不得为 [00:00.00]**,需考虑前奏空白。 + + ### **内容与结构要求** + 1. 歌词应富有变化,使情绪递进,整体连贯有层次感。**每行歌词长度应自然变化**,切勿长度一致,导致很格式化。 + 2. **时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。 + 3. 间奏/尾奏仅通过时间空白体现(如从 [02:30.00] 直接跳至 [02:50.00]),**无需文字描述**。 + + ### **负面示例(禁止出现)** + - 错误:[01:30.00](钢琴间奏) + - 错误:[02:00.00][副歌] + - 错误:空行、换行符、注释 + """ + + response = client.chat.completions.create( + model="ep-20250304144033-nr9wl", + messages=[ + {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."}, + {"role": "user", "content": llm_prompt.format(theme=theme, tags=tags_gen, language=language)}, + ], + stream=False + ) + + info = response.choices[0].message.content + + return info + + except requests.exceptions.RequestException as e: + print(f'请求出错: {e}') + return {} + + + +def R1_infer2(tags_lyrics, lyrics_input): + client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3") + + llm_prompt = """ + {lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格,我现在想要给这首歌的每一句歌词打时间戳得到LRC,我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测,而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度,避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词,每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。 + """ + + response = client.chat.completions.create( + model="ep-20250304144033-nr9wl", + messages=[ + {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."}, + {"role": "user", "content": llm_prompt.format(lyrics_input=lyrics_input, tags_lyrics=tags_lyrics)}, + ], + stream=False + ) + + info = response.choices[0].message.content + + return info + +css = """ +/* 固定文本域高度并强制滚动条 */ +.lyrics-scroll-box textarea { + height: 405px !important; /* 固定高度 */ + max-height: 500px !important; /* 最大高度 */ + overflow-y: auto !important; /* 垂直滚动 */ + white-space: pre-wrap; /* 保留换行 */ + line-height: 1.5; /* 行高优化 */ +} + 
+.gr-examples { + background: transparent !important; + border: 1px solid #e0e0e0 !important; + border-radius: 8px; + margin: 1rem 0 !important; + padding: 1rem !important; +} + +""" + + +with gr.Blocks(css=css) as demo: + gr.HTML(f""" +
+ +
+ +
+
+ Di♪♪Rhythm (谛韵) +
+
+ + + + + + + + + +
+
+ """) + + with gr.Tabs() as tabs: + + # page 1 + with gr.Tab("Music Generate", id=0): + with gr.Row(): + with gr.Column(): + lrc = gr.Textbox( + label="Lyrics", + placeholder="Input the full lyrics", + lines=12, + max_lines=50, + elem_classes="lyrics-scroll-box", + value="""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n[01:34.95]And that I'm enough\n[01:37.91]I need it and I don't know why\n[01:42.08]This late at night\n[01:44.24]Isn't it lonely\n[01:47.18]I'd do anything to make you want me\n[01:51.30]I'd give it all up if you told me\n[01:55.32]That I'd be\n[01:57.35]The number one girl in your eyes\n[02:00.72]Your one and only\n[02:03.57]So what's it gon' take for you to want me\n[02:07.78]I'd give it all up if you told me\n[02:11.74]That I'd be\n[02:13.86]The number one girl in your eyes\n[02:17.03]The girl in your eyes\n[02:21.05]The girl in your 
eyes\n[02:26.30]Tell me I'm the number one girl\n[02:28.44]I'm the number one girl in your eyes\n[02:33.49]The girl in your eyes\n[02:37.58]The girl in your eyes\n[02:42.74]Tell me I'm the number one girl\n[02:44.88]I'm the number one girl in your eyes\n[02:49.91]Well isn't it lonely\n[02:53.19]I'd do anything to make you want me\n[02:57.10]I'd give it all up if you told me\n[03:01.15]That I'd be\n[03:03.31]The number one girl in your eyes\n[03:06.57]Your one and only\n[03:09.42]So what's it gon' take for you to want me\n[03:13.50]I'd give it all up if you told me\n[03:17.56]That I'd be\n[03:19.66]The number one girl in your eyes\n[03:25.74]The number one girl in your eyes""" + ) + + current_prompt_type = gr.State(value="audio") + with gr.Tabs() as inside_tabs: + with gr.Tab("Audio Prompt"): + audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./src/prompt/default.wav") + with gr.Tab("Text Prompt"): + text_prompt = gr.Textbox( + label="Text Prompt", + placeholder="Enter the Text Prompt, eg: emotional piano pop", + ) + def update_prompt_type(evt: gr.SelectData): + return "audio" if evt.index == 0 else "text" + + inside_tabs.select( + fn=update_prompt_type, + outputs=current_prompt_type + ) + + with gr.Column(): + with gr.Accordion("Best Practices Guide", open=True): + gr.Markdown(""" +1. **Lyrics Format Requirements** + - Each line must follow: `[mm:ss.xx]Lyric content` + - Example of valid format: + ``` + [00:10.00]Moonlight spills through broken blinds + [00:13.20]Your shadow dances on the dashboard shrine + ``` + +2. **Audio Prompt Requirements** + - Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds + - For optimal results, the 10-second clips should be carefully selected + - Shorter clips may lead to incoherent generation +3. **Supported Languages** + - **Chinese and English** + - More languages comming soon + +4. 
**Others** + - If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings. + + """) + Music_Duration = gr.Radio(["95s", "285s"], label="Music Duration", value="95s") + + lyrics_btn = gr.Button("Generate", variant="primary") + audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output") + with gr.Accordion("Advanced Settings", open=False): + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=0, + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True) + + steps = gr.Slider( + minimum=10, + maximum=100, + value=32, + step=1, + label="Diffusion Steps", + interactive=True, + elem_id="step_slider" + ) + cfg_strength = gr.Slider( + minimum=1, + maximum=10, + value=4.0, + step=0.5, + label="CFG Strength", + interactive=True, + elem_id="step_slider" + ) + odeint_method = gr.Radio(["euler", "midpoint", "rk4","implicit_adams"], label="ODE Solver", value="euler") + file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="wav") + + + gr.Examples( + examples=[ + ["./src/prompt/pop_cn.wav"], + ["./src/prompt/pop_en.wav"], + ["./src/prompt/rock_cn.wav"], + ["./src/prompt/rock_en.wav"], + ["./src/prompt/country_cn.wav"], + ["./src/prompt/country_en.wav"], + ["./src/prompt/classic_cn.wav"], + ["./src/prompt/classic_en.wav"], + ["./src/prompt/jazz_cn.wav"], + ["./src/prompt/jazz_en.wav"], + ["./src/prompt/rap_cn.wav"], + ["./src/prompt/rap_en.wav"], + ["./src/prompt/default.wav"] + ], + inputs=[audio_prompt], + label="Audio Examples", + examples_per_page=13, + elem_id="audio-examples-container" + ) + + gr.Examples( + examples=[ + ["Pop Emotional Piano"], + ["流行 情感 钢琴"], + ["Indie folk ballad, coming-of-age themes, acoustic guitar picking with harmonica interludes"], + ["独立民谣, 成长主题, 原声吉他弹奏与口琴间奏"] + ], + inputs=[text_prompt], + label="Text Examples", + examples_per_page=4, + elem_id="text-examples-container" + ) + + gr.Examples( + examples=[ + ["""[00:04.34]Tell 
me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n[01:34.95]And that I'm enough\n[01:37.91]I need it and I don't know why\n[01:42.08]This late at night\n[01:44.24]Isn't it lonely\n[01:47.18]I'd do anything to make you want me\n[01:51.30]I'd give it all up if you told me\n[01:55.32]That I'd be\n[01:57.35]The number one girl in your eyes\n[02:00.72]Your one and only\n[02:03.57]So what's it gon' take for you to want me\n[02:07.78]I'd give it all up if you told me\n[02:11.74]That I'd be\n[02:13.86]The number one girl in your eyes\n[02:17.03]The girl in your eyes\n[02:21.05]The girl in your eyes\n[02:26.30]Tell me I'm the number one girl\n[02:28.44]I'm the number one girl in your eyes\n[02:33.49]The girl in your eyes\n[02:37.58]The girl in your eyes\n[02:42.74]Tell me I'm the number one girl\n[02:44.88]I'm the number one girl in your eyes\n[02:49.91]Well isn't it 
lonely\n[02:53.19]I'd do anything to make you want me\n[02:57.10]I'd give it all up if you told me\n[03:01.15]That I'd be\n[03:03.31]The number one girl in your eyes\n[03:06.57]Your one and only\n[03:09.42]So what's it gon' take for you to want me\n[03:13.50]I'd give it all up if you told me\n[03:17.56]That I'd be\n[03:19.66]The number one girl in your eyes\n[03:25.74]The number one girl in your eyes"""], + ["""[00:00.52]Abracadabra abracadabra\n[00:03.97]Ha\n[00:04.66]Abracadabra abracadabra\n[00:12.02]Yeah\n[00:15.80]Pay the toll to the angels\n[00:19.08]Drawin' circles in the clouds\n[00:23.31]Keep your mind on the distance\n[00:26.67]When the devil turns around\n[00:30.95]Hold me in your heart tonight\n[00:34.11]In the magic of the dark moonlight\n[00:38.44]Save me from this empty fight\n[00:43.83]In the game of life\n[00:45.84]Like a poem said by a lady in red\n[00:49.45]You hear the last few words of your life\n[00:53.15]With a haunting dance now you're both in a trance\n[00:56.90]It's time to cast your spell on the night\n[01:01.40]Abracadabra ama-ooh-na-na\n[01:04.88]Abracadabra porta-ooh-ga-ga\n[01:08.92]Abracadabra abra-ooh-na-na\n[01:12.30]In her tongue she's sayin'\n[01:14.76]Death or love tonight\n[01:18.61]Abracadabra abracadabra\n[01:22.18]Abracadabra abracadabra\n[01:26.08]Feel the beat under your feet\n[01:27.82]The floor's on fire\n[01:29.90]Abracadabra abracadabra\n[01:33.78]Choose the road on the west side\n[01:37.09]As the dust flies watch it burn\n[01:41.45]Don't waste time on feeling\n[01:44.64]Your depression won't return\n[01:49.15]Hold me in your heart tonight\n[01:52.21]In the magic of the dark moonlight\n[01:56.54]Save me from this empty fight\n[02:01.77]In the game of life\n[02:03.94]Like a poem said by a lady in red\n[02:07.52]You hear the last few words of your life\n[02:11.19]With a haunting dance now you're both in a trance\n[02:14.95]It's time to cast your spell on the night\n[02:19.53]Abracadabra 
ama-ooh-na-na\n[02:22.71]Abracadabra porta-ooh-ga-ga\n[02:26.94]Abracadabra abra-ooh-na-na\n[02:30.42]In her tongue she's sayin'\n[02:32.83]Death or love tonight\n[02:36.55]Abracadabra abracadabra\n[02:40.27]Abracadabra abracadabra\n[02:44.19]Feel the beat under your feet\n[02:46.14]The floor's on fire\n[02:47.95]Abracadabra abracadabra\n[02:51.17]Phantom of the dance floor come to me\n[02:58.46]Sing for me a sinful melody\n[03:06.51]Ah-ah-ah-ah-ah ah-ah ah-ah\n[03:13.76]Ah-ah-ah-ah-ah ah-ah ah-ah\n[03:22.39]Abracadabra ama-ooh-na-na\n[03:25.66]Abracadabra porta-ooh-ga-ga\n[03:29.87]Abracadabra abra-ooh-na-na\n[03:33.16]In her tongue she's sayin'\n[03:35.55]Death or love tonight"""], + ["""[00:00.27]只因你太美 baby 只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby bae\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby 只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n[01:36.35]你到底属于谁\n[01:37.65]就是现在告诉我\n[01:40.00]跟着那节奏 缓缓 make wave\n[01:42.42]甜蜜的奶油 it's your birthday cake\n[01:44.66]男人们的 game call me 你恋人\n[01:46.83]别被欺骗愉快的 I wanna play\n[01:48.83]我的脑海每分每秒为你一人沉醉\n[01:50.90]最迷人让我神魂颠倒是你身上香水\n[01:53.30]Oh right baby I'm fall in love with you\n[01:55.20]我的一切你都拿走\n[01:56.40]只要有你就已足够\n[01:58.56]我到底应该怎样\n[02:00.37]Uh 我心里一直很不安\n[02:03.12]其他男人们的视线\n[02:04.84]Oh 全都只看着你的脸\n[02:07.33]Eh oh\n[02:08.39]难道真的因你而疯狂吗\n[02:12.43]我本来不是这种人\n[02:14.35]因你变成奇怪的人\n[02:16.59]第一次呀变成这样的我\n[02:21.76]不管我怎么去否认\n[02:24.03]只因你太美 baby 只因你太美 
baby\n[02:32.37]只因你实在是太美 baby\n[02:37.49]只因你太美 baby\n[02:43.66]我愿意把我的全部都给你\n[02:47.19]我每天在梦里都梦见你\n[02:49.13]还有我闭着眼睛也能看到你\n[02:52.58]现在开始我只准你看我\n[02:56.28]I don't wanna wake up in dream\n[02:57.92]我只想看你这是真心话\n[02:59.86]只因你太美 baby 只因你太美 baby\n[03:08.20]只因你实在是太美 baby\n[03:13.22]只因你太美 baby\n[03:17.69]Oh eh oh\n[03:19.36]现在确认的告诉我\n[03:21.91]Oh eh oh\n[03:23.85]你到底属于谁\n[03:26.58]Oh eh oh\n[03:28.32]现在确认的告诉我\n[03:30.95]Oh eh oh\n[03:32.82]你到底属于谁就是现在告诉我"""] + ], + + inputs=[lrc], + label="Lrc Examples", + examples_per_page=3, + elem_id="lrc-examples-container", + ) + + + # page 2 + with gr.Tab("Lyrics Generate", id=1): + with gr.Row(): + with gr.Column(): + with gr.Accordion("Notice", open=False): + gr.Markdown("**Two Generation Modes:**\n1. Generate from theme & tags\n2. Add timestamps to existing lyrics") + + with gr.Group(): + gr.Markdown("### Method 1: Generate from Theme") + theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g: Love and Heartbreak") + tags_gen = gr.Textbox(label="tags", placeholder="Enter song tags, e.g: pop confidence healing") + language = gr.Radio(["cn", "en"], label="Language", value="en") + gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary") + + gr.Examples( + examples=[ + [ + "Love and Heartbreak", + "vocal emotional piano pop", + "en" + ], + [ + "Heroic Epic", + "choir orchestral powerful", + "cn" + ] + ], + inputs=[theme, tags_gen, language], + label="Examples: Generate from Theme" + ) + + with gr.Group(visible=True): + gr.Markdown("### Method 2: Add Timestamps to Lyrics") + tags_lyrics = gr.Textbox(label="tags", placeholder="Enter song tags, e.g: ballad piano slow") + lyrics_input = gr.Textbox( + label="Raw Lyrics (without timestamps)", + placeholder="Enter plain lyrics (without timestamps), e.g:\nYesterday\nAll my troubles...", + lines=10, + max_lines=50, + elem_classes="lyrics-scroll-box" + ) + + gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary") + + gr.Examples( + 
examples=[ + [ + "acoustic folk happy", + """I'm sitting here in the boring room\nIt's just another rainy Sunday afternoon""" + ], + [ + "electronic dance energetic", + """We're living in a material world\nAnd I am a material girl""" + ] + ], + inputs=[tags_lyrics, lyrics_input], + label="Examples: Generate from Lyrics" + ) + + + with gr.Column(): + lrc_output = gr.Textbox( + label="Generated LRC", + placeholder="Timed lyrics will appear here", + lines=57, + elem_classes="lrc-output", + show_copy_button=True + ) + + # Bind functions + gen_from_theme_btn.click( + fn=R1_infer1, + inputs=[theme, tags_gen, language], + outputs=lrc_output + ) + + gen_from_lyrics_btn.click( + fn=R1_infer2, + inputs=[tags_lyrics, lyrics_input], + outputs=lrc_output + ) + + tabs.select( + lambda s: None, + None, + None + ) + + lyrics_btn.click( + fn=infer_music, + inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method, Music_Duration], + outputs=audio_output + ) + + +if __name__ == "__main__": + demo.launch() diff --git a/gradio/src/DiffRhythm.jpg b/gradio/src/DiffRhythm.jpg new file mode 100644 index 0000000..8d472ce Binary files /dev/null and b/gradio/src/DiffRhythm.jpg differ diff --git a/gradio/src/negative_prompt.npy b/gradio/src/negative_prompt.npy new file mode 100644 index 0000000..ae5aa34 Binary files /dev/null and b/gradio/src/negative_prompt.npy differ diff --git a/gradio/src/prompt/classic_cn.wav b/gradio/src/prompt/classic_cn.wav new file mode 100644 index 0000000..77e4dc2 Binary files /dev/null and b/gradio/src/prompt/classic_cn.wav differ diff --git a/gradio/src/prompt/classic_en.wav b/gradio/src/prompt/classic_en.wav new file mode 100644 index 0000000..47d5623 Binary files /dev/null and b/gradio/src/prompt/classic_en.wav differ diff --git a/gradio/src/prompt/country_cn.wav b/gradio/src/prompt/country_cn.wav new file mode 100644 index 0000000..81935a2 Binary files /dev/null and 
b/gradio/src/prompt/country_cn.wav differ diff --git a/gradio/src/prompt/country_en.wav b/gradio/src/prompt/country_en.wav new file mode 100644 index 0000000..6e42b4b Binary files /dev/null and b/gradio/src/prompt/country_en.wav differ diff --git a/gradio/src/prompt/default.wav b/gradio/src/prompt/default.wav new file mode 100644 index 0000000..6b4a833 Binary files /dev/null and b/gradio/src/prompt/default.wav differ diff --git a/gradio/src/prompt/gift_of_the_world.wav b/gradio/src/prompt/gift_of_the_world.wav new file mode 100644 index 0000000..2cd5c03 Binary files /dev/null and b/gradio/src/prompt/gift_of_the_world.wav differ diff --git a/gradio/src/prompt/jazz_cn.wav b/gradio/src/prompt/jazz_cn.wav new file mode 100644 index 0000000..a5e9918 Binary files /dev/null and b/gradio/src/prompt/jazz_cn.wav differ diff --git a/gradio/src/prompt/jazz_en.wav b/gradio/src/prompt/jazz_en.wav new file mode 100644 index 0000000..ef32a68 Binary files /dev/null and b/gradio/src/prompt/jazz_en.wav differ diff --git a/gradio/src/prompt/little_happiness.wav b/gradio/src/prompt/little_happiness.wav new file mode 100644 index 0000000..d8432da Binary files /dev/null and b/gradio/src/prompt/little_happiness.wav differ diff --git a/gradio/src/prompt/little_talks.wav b/gradio/src/prompt/little_talks.wav new file mode 100644 index 0000000..56b7898 Binary files /dev/null and b/gradio/src/prompt/little_talks.wav differ diff --git a/gradio/src/prompt/most_beautiful_expectation.wav b/gradio/src/prompt/most_beautiful_expectation.wav new file mode 100644 index 0000000..5b18157 Binary files /dev/null and b/gradio/src/prompt/most_beautiful_expectation.wav differ diff --git a/gradio/src/prompt/pop_cn.wav b/gradio/src/prompt/pop_cn.wav new file mode 100644 index 0000000..252b6dc Binary files /dev/null and b/gradio/src/prompt/pop_cn.wav differ diff --git a/gradio/src/prompt/pop_en.wav b/gradio/src/prompt/pop_en.wav new file mode 100644 index 0000000..ed13ccc Binary files /dev/null and 
b/gradio/src/prompt/pop_en.wav differ diff --git a/gradio/src/prompt/rap_cn.wav b/gradio/src/prompt/rap_cn.wav new file mode 100644 index 0000000..954edb6 Binary files /dev/null and b/gradio/src/prompt/rap_cn.wav differ diff --git a/gradio/src/prompt/rap_en.wav b/gradio/src/prompt/rap_en.wav new file mode 100644 index 0000000..003fb6c Binary files /dev/null and b/gradio/src/prompt/rap_en.wav differ diff --git a/gradio/src/prompt/rock_cn.wav b/gradio/src/prompt/rock_cn.wav new file mode 100644 index 0000000..0631dc6 Binary files /dev/null and b/gradio/src/prompt/rock_cn.wav differ diff --git a/gradio/src/prompt/rock_en.wav b/gradio/src/prompt/rock_en.wav new file mode 100644 index 0000000..3440807 Binary files /dev/null and b/gradio/src/prompt/rock_en.wav differ diff --git a/infer/infer.py b/infer/infer.py index 9bc09ae..4a8306f 100755 --- a/infer/infer.py +++ b/infer/infer.py @@ -24,7 +24,7 @@ print("Current working directory:", os.getcwd()) -from infer_utils import ( +from .infer_utils import ( decode_audio, get_lrc_token, get_negative_style_prompt, diff --git a/infer/infer_utils.py b/infer/infer_utils.py index 4602f68..8bc7488 100755 --- a/infer/infer_utils.py +++ b/infer/infer_utils.py @@ -30,6 +30,7 @@ from model import DiT, CFM + def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128): downsampling_ratio = 2048 io_channels = 2 @@ -136,6 +137,41 @@ def get_negative_style_prompt(device): @torch.no_grad() + +def get_audio_style_prompt(model, wav_path): + vocal_flag = False + mulan = model + audio, _ = librosa.load(wav_path, sr=24000) + audio_len = librosa.get_duration(y=audio, sr=24000) + + if audio_len <= 1: + vocal_flag = True + + if audio_len > 10: + start_time = int(audio_len // 2 - 5) + wav = audio[start_time*24000:(start_time+10)*24000] + + else: + wav = audio + wav = torch.tensor(wav).unsqueeze(0).to(model.device) + + with torch.no_grad(): + audio_emb = mulan(wavs = wav) # [1, 512] + + audio_emb = audio_emb.half() + + return 
audio_emb, vocal_flag + +def get_text_style_prompt(model, text_prompt): + mulan = model + + with torch.no_grad(): + text_emb = mulan(texts = text_prompt) # [1, 512] + text_emb = text_emb.half() + + return text_emb + + def get_style_prompt(model, wav_path=None, prompt=None): mulan = model diff --git a/requirements.txt b/requirements.txt index 9264680..ba05241 100755 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,7 @@ onnxruntime Unidecode==1.3.8 phonemizer==3.3.0 inflect==7.5.0 +openai +spaces +py3langid +