Skip to content

Commit 70117e3

Browse files
authored
Merge pull request #13 from audiohacking/copilot/fix-cover-mode-detection
Fix cover/repaint generation failures: wrong audio dir, bad request JSON format, missing debug logs
2 parents bcf5978 + 841077a commit 70117e3

3 files changed

Lines changed: 167 additions & 53 deletions

File tree

server/src/config/index.ts

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,11 @@ const __dirname = path.dirname(__filename);
1313
// release: <bundle>/server/dist/config/ → ../../.. → <bundle>/
1414
const APP_ROOT = path.resolve(__dirname, '../../..');
1515

16+
// Server root = two levels above this file (always the server/ directory):
17+
// dev: <repo>/server/src/config/ → ../.. → <repo>/server/
18+
// release: <bundle>/server/dist/config/ → ../.. → <bundle>/server/
19+
const SERVER_ROOT = path.resolve(__dirname, '../..');
20+
1621
// ── Path helpers ─────────────────────────────────────────────────────────────
1722

1823
/**
@@ -219,7 +224,11 @@ export const config = {
219224

220225
storage: {
221226
provider: 'local' as const,
222-
audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(APP_ROOT, 'public', 'audio')),
227+
// Audio directory must match where LocalStorageProvider writes files and
228+
// where Express serves /audio/ from (server/src/index.ts: '../public/audio').
229+
// Both resolve to <server_root>/public/audio, so we use SERVER_ROOT here.
230+
// AUDIO_DIR env override is still supported (resolved against APP_ROOT).
231+
audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(SERVER_ROOT, 'public', 'audio')),
223232
},
224233

225234
jwt: {

server/src/routes/generate.ts

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -270,11 +270,27 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
270270
return;
271271
}
272272

273-
if (customMode && !style && !lyrics && !referenceAudioUrl) {
273+
// In custom mode, at least one content field is required — unless the request
274+
// is for cover, audio2audio, or repaint mode and a source audio is provided
275+
// (the source audio itself is the primary input; style/lyrics are optional).
276+
const requiresSourceAudio = taskType === 'cover' || taskType === 'audio2audio' || taskType === 'repaint';
277+
if (customMode && !style && !lyrics && !referenceAudioUrl && !(requiresSourceAudio && sourceAudioUrl)) {
274278
res.status(400).json({ error: 'Style, lyrics, or reference audio required for custom mode' });
275279
return;
276280
}
277281

282+
// Debug log: show what the API client sent
283+
console.log(
284+
`[API] POST /generate:` +
285+
`\n taskType = ${taskType || 'text2music'}` +
286+
`\n customMode = ${customMode}` +
287+
`\n ditModel = ${ditModel || '(default)'}` +
288+
`\n sourceAudio = ${sourceAudioUrl || 'none'}` +
289+
`\n repaint = [${repaintingStart ?? 'start'}, ${repaintingEnd ?? 'end'}]` +
290+
`\n coverStr = ${audioCoverStrength ?? 'n/a'}` +
291+
`\n user = ${req.user!.id}`
292+
);
293+
278294
const params = {
279295
customMode,
280296
songDescription,

server/src/services/acestep.ts

Lines changed: 140 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -220,18 +220,37 @@ function resolveParamDitModel(name: string | undefined): string {
220220
// Audio path resolution (for reference/source audio inputs)
221221
// ---------------------------------------------------------------------------
222222

223+
/**
224+
* Resolves a UI audio URL (e.g. "/audio/reference-tracks/user/file.mp3") or
225+
* an absolute filesystem path to the local filesystem path that the spawned
226+
* binary can open.
227+
*
228+
* Supported input formats:
229+
* • "/audio/<rest>" — relative public URL; joined with AUDIO_DIR
230+
* (covers reference-tracks/, generated songs, etc.)
231+
* • "http[s]://host/audio/…" — absolute URL whose path starts with /audio/
232+
* • Any other absolute path — returned as-is
233+
*/
223234
function resolveAudioPath(audioUrl: string): string {
235+
// Relative public URL produced by the UI player or upload endpoint
224236
if (audioUrl.startsWith('/audio/')) {
225-
return path.join(AUDIO_DIR, audioUrl.replace('/audio/', ''));
237+
const resolved = path.join(AUDIO_DIR, audioUrl.slice('/audio/'.length));
238+
console.log(`[resolveAudio] ${audioUrl} → ${resolved}`);
239+
return resolved;
226240
}
227-
if (audioUrl.startsWith('http')) {
241+
// Full HTTP URL — extract the path component and try again
242+
if (audioUrl.startsWith('http://') || audioUrl.startsWith('https://')) {
228243
try {
229244
const parsed = new URL(audioUrl);
230245
if (parsed.pathname.startsWith('/audio/')) {
231-
return path.join(AUDIO_DIR, parsed.pathname.replace('/audio/', ''));
246+
const resolved = path.join(AUDIO_DIR, parsed.pathname.slice('/audio/'.length));
247+
console.log(`[resolveAudio] ${audioUrl} → ${resolved}`);
248+
return resolved;
232249
}
233250
} catch { /* fall through */ }
234251
}
252+
// Already an absolute filesystem path — pass through
253+
console.log(`[resolveAudio] ${audioUrl} → (absolute path, no change)`);
235254
return audioUrl;
236255
}
237256

@@ -446,61 +465,105 @@ async function runViaSpawn(
446465
const tmpDir = path.join(AUDIO_DIR, `_tmp_${jobId}`);
447466
await mkdir(tmpDir, { recursive: true });
448467

468+
// ── Determine generation mode ────────────────────────────────────────────
469+
// Explicit task type drives mode selection; source audio / audio codes act
470+
// as secondary signals for backward compatibility.
471+
const taskType = params.taskType || 'text2music';
472+
const isCover = taskType === 'cover' || taskType === 'audio2audio';
473+
const isRepaint = taskType === 'repaint';
474+
// Passthrough: taskType explicitly set, or audio codes provided without
475+
// a source audio file (legacy callers that omit the taskType field).
476+
const isPassthru = taskType === 'passthrough' || Boolean(params.audioCodes && !params.sourceAudioUrl);
477+
// LLM (ace-qwen3) is only needed for plain text-to-music generation.
478+
// Cover, repaint, and passthrough all skip it.
479+
const skipLm = isCover || isRepaint || isPassthru;
480+
481+
// ── Debug: log what the UI/API client requested ──────────────────────────
482+
console.log(
483+
`[Job ${jobId}] Request received:` +
484+
`\n mode = ${taskType}` +
485+
`\n customMode = ${params.customMode}` +
486+
`\n ditModel = ${params.ditModel || '(default)'}` +
487+
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
488+
`\n repaintRegion = [${params.repaintingStart ?? 'start'}, ${params.repaintingEnd ?? 'end'}]` +
489+
`\n coverStrength = ${params.audioCoverStrength ?? 'n/a'}` +
490+
`\n steps = ${params.inferenceSteps ?? 8}` +
491+
`\n guidance = ${params.guidanceScale ?? 0.0}` +
492+
`\n shift = ${params.shift ?? 3.0}` +
493+
`\n skipLm = ${skipLm}`
494+
);
495+
449496
try {
450497
// ── Build request.json ─────────────────────────────────────────────────
451-
// ace-qwen3 reads generation parameters from a JSON file. Only `caption`
452-
// is strictly required; all other fields default to sensible values.
498+
// The JSON file is read by ace-qwen3 (text2music) or dit-vae directly
499+
// (cover / repaint / passthrough). Only include the fields each binary
500+
// actually understands so the format stays clean and predictable.
453501
const caption = params.style || 'pop music';
454502
const prompt = params.customMode ? caption : (params.songDescription || caption);
455-
// Instrumental: pass the special "[Instrumental]" lyrics string so the LLM
503+
// Instrumental: pass the special "[Instrumental]" lyrics marker so the LLM
456504
// skips lyrics generation (as documented in the acestep.cpp README).
457505
const lyrics = params.instrumental ? '[Instrumental]' : (params.lyrics || '');
458506

507+
// Fields common to all modes (understood by both ace-qwen3 and dit-vae)
459508
const requestJson: Record<string, unknown> = {
460-
caption: prompt,
509+
caption: prompt,
461510
lyrics,
462-
vocal_language: params.vocalLanguage || 'unknown',
463-
seed: params.randomSeed !== false ? -1 : (params.seed ?? -1),
464-
lm_temperature: params.lmTemperature ?? 0.85,
465-
lm_cfg_scale: params.lmCfgScale ?? 2.0,
466-
lm_top_p: params.lmTopP ?? 0.9,
467-
lm_top_k: params.lmTopK ?? 0,
468-
lm_negative_prompt: params.lmNegativePrompt || '',
469-
inference_steps: params.inferenceSteps ?? 8,
470-
guidance_scale: params.guidanceScale ?? 0.0,
471-
shift: params.shift ?? 3.0,
511+
seed: params.randomSeed !== false ? -1 : (params.seed ?? -1),
512+
inference_steps: params.inferenceSteps ?? 8,
513+
guidance_scale: params.guidanceScale ?? 0.0,
514+
shift: params.shift ?? 3.0,
472515
};
473-
// Optional metadata (0 / empty = let the LLM fill it)
474-
if (params.bpm && params.bpm > 0) requestJson.bpm = params.bpm;
475-
if (params.duration && params.duration > 0) requestJson.duration = params.duration;
476-
if (params.keyScale) requestJson.keyscale = params.keyScale;
477-
if (params.timeSignature) requestJson.timesignature = params.timeSignature;
478-
// Passthrough: skip the LLM when audio codes are already provided
479-
if (params.audioCodes) requestJson.audio_codes = params.audioCodes;
480-
// Cover/audio-to-audio: strength of the source audio influence on the output
481-
// (ignored in repaint mode — the mask handles everything)
482-
if (params.audioCoverStrength !== undefined && params.taskType !== 'repaint') {
483-
requestJson.audio_cover_strength = params.audioCoverStrength;
484-
}
485-
// Repaint mode: regenerate a time region while preserving the rest.
486-
// Activated by setting repainting_start and/or repainting_end in the JSON.
487-
// Both default to -1 (inactive): -1 on start means 0s, -1 on end means source duration.
488-
if (params.taskType === 'repaint' && params.sourceAudioUrl) {
489-
requestJson.repainting_start = params.repaintingStart ?? -1;
490-
requestJson.repainting_end = params.repaintingEnd ?? -1;
516+
517+
// Optional music metadata (0 / empty → binary fills it in)
518+
if (params.bpm && params.bpm > 0) requestJson.bpm = params.bpm;
519+
if (params.duration && params.duration > 0) requestJson.duration = params.duration;
520+
if (params.keyScale) requestJson.keyscale = params.keyScale;
521+
if (params.timeSignature) requestJson.timesignature = params.timeSignature;
522+
523+
if (skipLm) {
524+
// ── Cover / repaint / passthrough: ace-qwen3 is skipped ─────────────
525+
// Add only the mode-specific fields that dit-vae cares about.
526+
if (isPassthru) {
527+
if (!params.audioCodes) {
528+
// Passthrough requires pre-computed codes — fail early with a clear message
529+
throw new Error("task_type='passthrough' requires pre-computed audio_codes");
530+
}
531+
requestJson.audio_codes = params.audioCodes;
532+
} else if (isCover) {
533+
// Cover / audio-to-audio: strength of the source audio influence (0–1)
534+
if (params.audioCoverStrength !== undefined) {
535+
requestJson.audio_cover_strength = params.audioCoverStrength;
536+
}
537+
} else if (isRepaint) {
538+
// Repaint: regenerate only the specified time region; preserve the rest.
539+
// Both default to -1: start=-1 → 0 s, end=-1 → full source duration.
540+
// Note: sourceAudioUrl is guaranteed here — validated in processGeneration.
541+
requestJson.repainting_start = params.repaintingStart ?? -1;
542+
requestJson.repainting_end = params.repaintingEnd ?? -1;
543+
}
544+
} else {
545+
// ── Text-to-music: include LM parameters for ace-qwen3 ──────────────
546+
requestJson.vocal_language = params.vocalLanguage || 'unknown';
547+
requestJson.lm_temperature = params.lmTemperature ?? 0.85;
548+
requestJson.lm_cfg_scale = params.lmCfgScale ?? 2.0;
549+
requestJson.lm_top_p = params.lmTopP ?? 0.9;
550+
requestJson.lm_top_k = params.lmTopK ?? 0;
551+
requestJson.lm_negative_prompt = params.lmNegativePrompt || '';
491552
}
492553

493554
const requestPath = path.join(tmpDir, 'request.json');
494555
await writeFile(requestPath, JSON.stringify(requestJson, null, 2));
556+
console.log(`[Job ${jobId}] Request JSON written to ${requestPath}:`);
557+
console.log(JSON.stringify(requestJson, null, 2));
495558

496559
// ── Step 1: ace-qwen3 — LLM (lyrics + audio codes) ────────────────────
497560
// Skipped when:
498-
// • audio_codes are provided (passthrough) — codes are already known
499-
// • sourceAudioUrl is provided (cover/audio-to-audio) — dit-vae derives
500-
// codes directly from the source audio; running ace-qwen3 is not needed
561+
// • taskType is cover / audio2audio / repaint — dit-vae derives tokens
562+
// directly from the source audio; running ace-qwen3 is not needed
563+
// • taskType is passthrough — audio codes are already provided
501564
let enrichedPaths: string[] = [];
502565

503-
if (!params.audioCodes && !params.sourceAudioUrl) {
566+
if (!skipLm) {
504567
job.stage = 'LLM: generating lyrics and audio codes…';
505568

506569
const lmBin = config.acestep.lmBin!;
@@ -513,7 +576,7 @@ async function runViaSpawn(
513576
if (batchSize > 1) lmArgs.push('--batch', String(batchSize));
514577
lmArgs.push(...parseExtraArgs(process.env.ACE_QWEN3_EXTRA_ARGS));
515578

516-
console.log(`[Spawn] Job ${jobId}: ace-qwen3 ${lmArgs.slice(0, 6).join(' ')}`);
579+
console.log(`[Job ${jobId}] Running ace-qwen3:\n ${lmBin} ${lmArgs.join(' ')}`);
517580
await runBinary(lmBin, lmArgs, 'ace-qwen3', undefined, makeLmProgressHandler(job));
518581

519582
// Collect enriched JSON files produced by ace-qwen3:
@@ -528,24 +591,33 @@ async function runViaSpawn(
528591
if (enrichedPaths.length === 0) {
529592
throw new Error('ace-qwen3 produced no enriched request files');
530593
}
594+
console.log(`[Job ${jobId}] ace-qwen3 produced ${enrichedPaths.length} enriched file(s): ${enrichedPaths.join(', ')}`);
531595
} else {
532-
// Passthrough: use the original request.json directly
533-
// (audio codes provided, or source audio supplied for cover/audio-to-audio mode)
596+
// Cover / repaint / passthrough: pass the original request.json directly
597+
// to dit-vae; no LLM enrichment step needed.
534598
enrichedPaths = [requestPath];
599+
console.log(`[Job ${jobId}] LLM step skipped (mode=${taskType}); passing request.json directly to dit-vae`);
535600
}
536601

537602
// ── Step 2: dit-vae — DiT + VAE (audio synthesis) ──────────────────────
538603
job.stage = 'DiT+VAE: synthesising audio…';
539604

540-
const ditVaeBin = config.acestep.ditVaeBin!;
541-
const textEncoderModel = config.acestep.textEncoderModel;
542-
const ditModel = resolveParamDitModel(params.ditModel);
543-
const vaeModel = config.acestep.vaeModel;
605+
const ditVaeBin = config.acestep.ditVaeBin!;
606+
const textEncoderModel = config.acestep.textEncoderModel;
607+
const ditModel = resolveParamDitModel(params.ditModel);
608+
const vaeModel = config.acestep.vaeModel;
544609

545610
if (!textEncoderModel) throw new Error('Text-encoder model not found — run models.sh first');
546611
if (!ditModel) throw new Error('DiT model not found — run models.sh first');
547612
if (!vaeModel) throw new Error('VAE model not found — run models.sh first');
548613

614+
console.log(
615+
`[Job ${jobId}] Resolved model paths:` +
616+
`\n text-encoder = ${textEncoderModel}` +
617+
`\n dit = ${ditModel}` +
618+
`\n vae = ${vaeModel}`
619+
);
620+
549621
const ditArgs: string[] = [
550622
'--request', ...enrichedPaths,
551623
'--text-encoder', textEncoderModel,
@@ -556,10 +628,14 @@ async function runViaSpawn(
556628
const batchSize = Math.min(Math.max(params.batchSize ?? 1, 1), 8);
557629
if (batchSize > 1) ditArgs.push('--batch', String(batchSize));
558630

559-
if (params.sourceAudioUrl) ditArgs.push('--src-audio', resolveAudioPath(params.sourceAudioUrl));
631+
// Cover and repaint modes both require a source audio file
632+
if (params.sourceAudioUrl) {
633+
const srcAudioPath = resolveAudioPath(params.sourceAudioUrl);
634+
ditArgs.push('--src-audio', srcAudioPath);
635+
}
560636
ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS));
561637

562-
console.log(`[Spawn] Job ${jobId}: dit-vae ${ditArgs.slice(0, 6).join(' ')}`);
638+
console.log(`[Job ${jobId}] Running dit-vae:\n ${ditVaeBin} ${ditArgs.join(' ')}`);
563639
await runBinary(ditVaeBin, ditArgs, 'dit-vae', undefined, makeDitVaeProgressHandler(job));
564640

565641
// ── Collect generated WAV files ─────────────────────────────────────────
@@ -608,7 +684,7 @@ async function runViaSpawn(
608684
status: 'succeeded',
609685
};
610686
job.rawResponse = enrichedMeta;
611-
console.log(`[Spawn] Job ${jobId}: completed with ${audioUrls.length} audio file(s)`);
687+
console.log(`[Job ${jobId}] Completed successfully with ${audioUrls.length} audio file(s): ${audioUrls.join(', ')}`);
612688

613689
// Clean up tmp directory
614690
await rm(tmpDir, { recursive: true, force: true }).catch(() => { /* best-effort */ });
@@ -843,16 +919,28 @@ async function processGeneration(
843919
job.status = 'running';
844920
job.stage = 'Starting generation...';
845921

922+
const mode = useSpawnMode(params) ? 'spawn' : 'http';
923+
console.log(
924+
`[Job ${jobId}] Starting generation (${mode} mode):` +
925+
`\n taskType = ${params.taskType || 'text2music'}` +
926+
`\n customMode = ${params.customMode}` +
927+
`\n ditModel = ${params.ditModel || '(default)'}` +
928+
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
929+
`\n audioCodes = ${params.audioCodes ? '[provided]' : 'none'}`
930+
);
931+
846932
if ((params.taskType === 'cover' || params.taskType === 'audio2audio') &&
847933
!params.sourceAudioUrl && !params.audioCodes) {
848934
job.status = 'failed';
849935
job.error = `task_type='${params.taskType}' requires a source audio or audio codes`;
936+
console.error(`[Job ${jobId}] Validation failed: ${job.error}`);
850937
return;
851938
}
852939

853940
if (params.taskType === 'repaint' && !params.sourceAudioUrl) {
854941
job.status = 'failed';
855942
job.error = "task_type='repaint' requires a source audio (--src-audio)";
943+
console.error(`[Job ${jobId}] Validation failed: ${job.error}`);
856944
return;
857945
}
858946

@@ -864,9 +952,10 @@ async function processGeneration(
864952
await runViaHttp(jobId, params, job);
865953
}
866954
} catch (err) {
867-
console.error(`Job ${jobId} failed:`, err);
955+
const errMsg = err instanceof Error ? err.message : String(err);
956+
console.error(`[Job ${jobId}] Generation failed: ${errMsg}`);
868957
job.status = 'failed';
869-
job.error = err instanceof Error ? err.message : 'Generation failed';
958+
job.error = errMsg || 'Generation failed';
870959
}
871960
}
872961

0 commit comments

Comments
 (0)