= ({
onClick={() => onNavigate('models')}
isExpanded={isOpen}
/>
+ }
+ label={t('debug')}
+ active={currentView === 'debug'}
+ onClick={() => onNavigate('debug')}
+ isExpanded={isOpen}
+ />
{/* Theme Toggle */}
diff --git a/components/SongList.tsx b/components/SongList.tsx
index 807fadd..2bfba37 100644
--- a/components/SongList.tsx
+++ b/components/SongList.tsx
@@ -641,6 +641,16 @@ const SongItem: React.FC = ({
{getModelDisplayName(song.ditModel)}
+ {song.audioUrl && !song.isGenerating && (() => {
+ const ext = song.audioUrl.split('.').pop()?.toLowerCase();
+ const fmtLabel = ext === 'mp3' ? 'MP3' : ext === 'wav' ? 'WAV' : null;
+ const fmtClass = ext === 'mp3' ? 'from-orange-500 to-amber-500' : 'from-sky-500 to-blue-600';
+ return fmtLabel ? (
+
+ {fmtLabel}
+
+ ) : null;
+ })()}
{song.isPublic === false && (
)}
diff --git a/i18n/translations.ts b/i18n/translations.ts
index 717471c..dd2a38b 100644
--- a/i18n/translations.ts
+++ b/i18n/translations.ts
@@ -8,6 +8,7 @@ export const translations = {
search: 'Search',
models: 'Models',
news: 'News',
+ debug: 'Debug',
// Theme
lightMode: 'Light Mode',
@@ -400,6 +401,25 @@ export const translations = {
sftModelNotFound: 'SFT model not found — go to Models to download it',
sftModelSwitched: 'Switched to SFT model for repaint',
autoSwitchedToSft: 'Auto-switched to SFT model (required for repaint)',
+
+ // Lego mode
+ legoMode: 'Lego',
+ legoModeDescription: 'Generate a new instrument track layered over the backing track',
+ legoTrackLabel: 'Instrument Track',
+ legoTrackPlaceholder: 'Select instrument…',
+ legoBaseModelRequired: 'Lego mode requires the base model (acestep-v15-base)',
+ autoSwitchedToBase: 'Auto-switched to base model (required for lego)',
+
+ // Understand
+ understand: 'Understand',
+ understandTooltip: 'Analyse this audio with AI to extract caption, lyrics, BPM and more',
+ understandApply: 'Apply to form',
+ understandApplyCaption: 'Apply caption',
+ understandApplyLyrics: 'Apply lyrics',
+ understandResult: 'Understand Result',
+ understandRunning: 'Analysing audio…',
+ understandError: 'Analysis failed',
+ understandNotAvailable: 'ace-understand binary not found',
// Search Page
searchSongsPlaceholder: 'Search for songs, playlists, creators, or genres',
@@ -547,6 +567,7 @@ export const translations = {
search: '搜索',
models: '模型',
news: '新闻',
+ debug: '调试',
// Theme
lightMode: '浅色模式',
@@ -939,6 +960,25 @@ export const translations = {
sftModelNotFound: '未找到 SFT 模型 — 前往模型页面下载',
sftModelSwitched: '已切换到 SFT 模型用于重绘',
autoSwitchedToSft: '已自动切换到 SFT 模型(重绘所需)',
+
+ // Lego mode
+ legoMode: 'Lego',
+ legoModeDescription: '在伴奏音轨上生成新的乐器轨道',
+ legoTrackLabel: '乐器轨道',
+ legoTrackPlaceholder: '选择乐器…',
+ legoBaseModelRequired: 'Lego 模式需要基础模型(acestep-v15-base)',
+ autoSwitchedToBase: '已自动切换到基础模型(Lego 模式所需)',
+
+ // Understand
+ understand: '分析',
+ understandTooltip: '用 AI 分析此音频,提取标题、歌词、BPM 等信息',
+ understandApply: '应用到表单',
+ understandApplyCaption: '应用标题',
+ understandApplyLyrics: '应用歌词',
+ understandResult: '分析结果',
+ understandRunning: '正在分析音频…',
+ understandError: '分析失败',
+ understandNotAvailable: '未找到 ace-understand 程序',
// Search Page
searchSongsPlaceholder: '搜索歌曲、播放列表、创作者或风格',
@@ -1086,6 +1126,7 @@ export const translations = {
search: '検索',
models: 'モデル',
news: 'ニュース',
+ debug: 'デバッグ',
// Theme
lightMode: 'ライトモード',
@@ -1478,6 +1519,25 @@ export const translations = {
sftModelNotFound: 'SFT モデルが見つかりません — モデルページでダウンロード',
sftModelSwitched: 'リペイント用に SFT モデルに切り替えました',
autoSwitchedToSft: 'SFT モデルに自動切り替え(リペイントに必要)',
+
+ // Lego mode
+ legoMode: 'Lego',
+ legoModeDescription: 'バッキングトラックに新しい楽器トラックを重ねて生成',
+ legoTrackLabel: '楽器トラック',
+ legoTrackPlaceholder: '楽器を選択…',
+ legoBaseModelRequired: 'Lego モードにはベースモデル(acestep-v15-base)が必要',
+ autoSwitchedToBase: 'ベースモデルに自動切り替え(Lego に必要)',
+
+ // Understand
+ understand: '解析',
+ understandTooltip: 'AI でこの音声を解析し、キャプション、歌詞、BPM などを抽出',
+ understandApply: 'フォームに適用',
+ understandApplyCaption: 'キャプションを適用',
+ understandApplyLyrics: '歌詞を適用',
+ understandResult: '解析結果',
+ understandRunning: '音声を解析中…',
+ understandError: '解析に失敗しました',
+ understandNotAvailable: 'ace-understand バイナリが見つかりません',
// Search Page
searchSongsPlaceholder: '曲、プレイリスト、クリエイター、スタイルを検索',
@@ -1625,6 +1685,7 @@ export const translations = {
search: '검색',
models: '모델',
news: '뉴스',
+ debug: '디버그',
// Theme
lightMode: '라이트 모드',
@@ -2017,6 +2078,25 @@ export const translations = {
sftModelNotFound: 'SFT 모델을 찾을 수 없습니다 — 모델 페이지에서 다운로드하세요',
sftModelSwitched: '리페인트를 위해 SFT 모델로 전환했습니다',
autoSwitchedToSft: 'SFT 모델로 자동 전환됨 (리페인트에 필요)',
+
+ // Lego mode
+ legoMode: 'Lego',
+ legoModeDescription: '기존 반주 트랙 위에 새로운 악기 트랙 생성',
+ legoTrackLabel: '악기 트랙',
+ legoTrackPlaceholder: '악기 선택…',
+ legoBaseModelRequired: 'Lego 모드에는 기본 모델(acestep-v15-base)이 필요합니다',
+ autoSwitchedToBase: '기본 모델로 자동 전환됨 (Lego에 필요)',
+
+ // Understand
+ understand: '분석',
+ understandTooltip: 'AI로 이 오디오를 분석하여 캡션, 가사, BPM 등을 추출',
+ understandApply: '양식에 적용',
+ understandApplyCaption: '캡션 적용',
+ understandApplyLyrics: '가사 적용',
+ understandResult: '분석 결과',
+ understandRunning: '오디오 분석 중…',
+ understandError: '분석 실패',
+ understandNotAvailable: 'ace-understand 바이너리를 찾을 수 없습니다',
// Search Page
searchSongsPlaceholder: '곡, 재생목록, 제작자 또는 스타일 검색',
diff --git a/server/package-lock.json b/server/package-lock.json
index e252cb4..a9087f3 100644
--- a/server/package-lock.json
+++ b/server/package-lock.json
@@ -13,6 +13,7 @@
"cors": "^2.8.5",
"dotenv": "^16.3.1",
"express": "^4.18.2",
+ "express-rate-limit": "^8.3.1",
"helmet": "^8.1.0",
"jsonwebtoken": "^9.0.2",
"multer": "^2.0.2",
@@ -1160,6 +1161,24 @@
"url": "https://opencollective.com/express"
}
},
+ "node_modules/express-rate-limit": {
+ "version": "8.3.1",
+ "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.3.1.tgz",
+ "integrity": "sha512-D1dKN+cmyPWuvB+G2SREQDzPY1agpBIcTa9sJxOPMCNeH3gwzhqJRDWCXW3gg0y//+LQ/8j52JbMROWyrKdMdw==",
+ "license": "MIT",
+ "dependencies": {
+ "ip-address": "10.1.0"
+ },
+ "engines": {
+ "node": ">= 16"
+ },
+ "funding": {
+ "url": "https://github.com/sponsors/express-rate-limit"
+ },
+ "peerDependencies": {
+ "express": ">= 4.11"
+ }
+ },
"node_modules/file-uri-to-path": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
@@ -1397,6 +1416,15 @@
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
"license": "ISC"
},
+ "node_modules/ip-address": {
+ "version": "10.1.0",
+ "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
+ "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 12"
+ }
+ },
"node_modules/ipaddr.js": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
diff --git a/server/package.json b/server/package.json
index 027abad..e41890a 100644
--- a/server/package.json
+++ b/server/package.json
@@ -15,6 +15,7 @@
"cors": "^2.8.5",
"dotenv": "^16.3.1",
"express": "^4.18.2",
+ "express-rate-limit": "^8.3.1",
"helmet": "^8.1.0",
"jsonwebtoken": "^9.0.2",
"multer": "^2.0.2",
diff --git a/server/src/config/index.ts b/server/src/config/index.ts
index 7abb41c..3155977 100644
--- a/server/src/config/index.ts
+++ b/server/src/config/index.ts
@@ -51,6 +51,16 @@ function resolveDitVaeBin(): string {
return '';
}
+/** Resolves the ace-understand binary path (reverse pipeline: audio → metadata). */
+function resolveUnderstandBin(): string {
+ if (process.env.ACE_UNDERSTAND_BIN) return resolveFromRoot(process.env.ACE_UNDERSTAND_BIN);
+ for (const name of ['ace-understand', 'ace-understand.exe']) {
+ const p = path.join(APP_ROOT, 'bin', name);
+ if (existsSync(p)) return p;
+ }
+ return '';
+}
+
// ── Model resolution ─────────────────────────────────────────────────────────
/** Resolves the models directory. */
@@ -91,6 +101,40 @@ function resolveDitModel(modelsDir: string): string {
return '';
}
+/**
+ * Resolves the base DiT model (acestep-v15-base-*.gguf).
+ * The base model is mandatory for lego mode — the turbo/sft variants will not work.
+ * Override via ACESTEP_BASE_MODEL in .env.
+ */
+function resolveBaseModel(modelsDir: string): string {
+ if (process.env.ACESTEP_BASE_MODEL) {
+ const p = resolveFromRoot(process.env.ACESTEP_BASE_MODEL);
+ if (existsSync(p)) return p;
+ console.warn(`[config] ACESTEP_BASE_MODEL path not found: ${p} — falling back to auto-detection`);
+ }
+ if (!existsSync(modelsDir)) return '';
+
+ const preference = [
+ 'acestep-v15-base-Q8_0.gguf',
+ 'acestep-v15-base-Q6_K.gguf',
+ 'acestep-v15-base-Q5_K_M.gguf',
+ 'acestep-v15-base-Q4_K_M.gguf',
+ 'acestep-v15-base-BF16.gguf',
+ ];
+ for (const name of preference) {
+ const p = path.join(modelsDir, name);
+ if (existsSync(p)) return p;
+ }
+
+ try {
+ const files = readdirSync(modelsDir).filter(f => f.endsWith('.gguf') && !f.endsWith('.part'));
+ const base = files.find(f => f.startsWith('acestep-v15-base'));
+ if (base) return path.join(modelsDir, base);
+ } catch { /* ignore read errors */ }
+
+ return '';
+}
+
/** Resolves the causal LM model (acestep-5Hz-lm-*.gguf). */
function resolveLmModel(modelsDir: string): string {
if (process.env.LM_MODEL) return resolveFromRoot(process.env.LM_MODEL);
@@ -167,7 +211,9 @@ function resolveVaeModel(modelsDir: string): string {
const modelsDir = resolveModelsDir();
const resolvedLmBin = resolveLmBin();
const resolvedDitVaeBin = resolveDitVaeBin();
+const resolvedUnderstandBin = resolveUnderstandBin();
const resolvedDitModel = resolveDitModel(modelsDir);
+const resolvedBaseModel = resolveBaseModel(modelsDir);
const resolvedLmModel = resolveLmModel(modelsDir);
const resolvedTextEncoderModel = resolveTextEncoderModel(modelsDir);
const resolvedVaeModel = resolveVaeModel(modelsDir);
@@ -177,12 +223,16 @@ if (resolvedLmBin) console.log(`[config] ace-qwen3: ${resolvedL
else console.log('[config] ace-qwen3: not found (set ACE_QWEN3_BIN)');
if (resolvedDitVaeBin) console.log(`[config] dit-vae: ${resolvedDitVaeBin}`);
else console.log('[config] dit-vae: not found (set DIT_VAE_BIN)');
+if (resolvedUnderstandBin) console.log(`[config] ace-understand: ${resolvedUnderstandBin}`);
+else console.log('[config] ace-understand: not found (set ACE_UNDERSTAND_BIN)');
if (resolvedLmModel) console.log(`[config] LM model: ${resolvedLmModel}`);
else console.log('[config] LM model: none (run models.sh)');
if (resolvedTextEncoderModel) console.log(`[config] text encoder: ${resolvedTextEncoderModel}`);
else console.log('[config] text encoder: none (run models.sh)');
if (resolvedDitModel) console.log(`[config] DiT model: ${resolvedDitModel}`);
else console.log('[config] DiT model: none (run models.sh)');
+if (resolvedBaseModel) console.log(`[config] base DiT model: ${resolvedBaseModel}`);
+else console.log('[config] base DiT model: none (download acestep-v15-base for lego mode)');
if (resolvedVaeModel) console.log(`[config] VAE model: ${resolvedVaeModel}`);
else console.log('[config] VAE model: none (run models.sh)');
@@ -203,9 +253,12 @@ export const config = {
// Two-binary spawn mode (acestep.cpp native pipeline)
lmBin: resolvedLmBin,
ditVaeBin: resolvedDitVaeBin,
+ understandBin: resolvedUnderstandBin,
lmModel: resolvedLmModel,
textEncoderModel: resolvedTextEncoderModel,
ditModel: resolvedDitModel,
+ // Base DiT model — required for lego mode (turbo/sft will not work)
+ baseModel: resolvedBaseModel,
vaeModel: resolvedVaeModel,
// HTTP fallback mode
diff --git a/server/src/routes/generate.ts b/server/src/routes/generate.ts
index 5402cb1..47c07fd 100644
--- a/server/src/routes/generate.ts
+++ b/server/src/routes/generate.ts
@@ -2,6 +2,7 @@ import { Router, Response } from 'express';
import multer from 'multer';
import path from 'path';
import { spawn } from 'child_process';
+import rateLimit from 'express-rate-limit';
import { pool } from '../db/pool.js';
import { generateUUID } from '../db/sqlite.js';
import { config } from '../config/index.js';
@@ -14,16 +15,43 @@ import {
checkSpaceHealth,
cleanupJob,
getJobRawResponse,
- downloadAudioToBuffer,
+ getJobLogs,
+ listActiveJobs,
} from '../services/acestep.js';
import { getStorageProvider } from '../services/storage/factory.js';
+// Rate limiter for the debug log polling endpoints (read-only, lightweight)
+const logRateLimiter = rateLimit({
+ windowMs: 60_000,
+ max: 120, // 2 req/s sustained — enough for 1.5s poll intervals
+ standardHeaders: true,
+ legacyHeaders: false,
+ message: { error: 'Too many log requests — please slow down polling' },
+});
+
+// Rate limiter for the job status polling endpoint (performs FS operations on first completion)
+const statusRateLimiter = rateLimit({
+ windowMs: 60_000,
+ max: 120, // 2 req/s sustained — enough for 2s frontend poll intervals
+ standardHeaders: true,
+ legacyHeaders: false,
+ message: { error: 'Too many status requests — please slow down polling' },
+});
+
const router = Router();
// Auto-generate a song title from lyrics or style when none is provided
-function autoTitle(params: { title?: string; lyrics?: string; instrumental?: boolean; style?: string; songDescription?: string }): string {
+function autoTitle(params: { title?: string; lyrics?: string; instrumental?: boolean; style?: string; songDescription?: string; taskType?: string; trackName?: string; sourceAudioTitle?: string }): string {
if (params.title?.trim()) return params.title.trim();
+ // For lego mode: combine source audio name + instrument to make a descriptive title
+ if (params.taskType === 'lego' && params.trackName) {
+ const base = params.sourceAudioTitle
+ ? params.sourceAudioTitle.replace(/\.[^.]+$/, '').replace(/[_-]+/g, ' ').trim()
+ : 'track';
+ return `${base} — ${params.trackName}`;
+ }
+
// Try first meaningful lyric line (skip section markers like [verse], [chorus])
if (!params.instrumental && params.lyrics) {
for (const line of params.lyrics.split('\n')) {
@@ -78,8 +106,8 @@ const audioUpload = multer({
});
interface GenerateBody {
- // Mode
- customMode: boolean;
+ // Mode (kept for backward compatibility; unified mode always uses full-featured panel)
+ customMode?: boolean;
// Simple Mode
songDescription?: string;
@@ -106,7 +134,7 @@ interface GenerateBody {
randomSeed?: boolean;
seed?: number;
thinking?: boolean;
- audioFormat?: 'mp3' | 'flac';
+ audioFormat?: 'mp3' | 'wav';
inferMethod?: 'ode' | 'sde';
shift?: number;
@@ -265,17 +293,11 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
ditModel,
} = req.body as GenerateBody;
- if (!customMode && !songDescription) {
- res.status(400).json({ error: 'Song description required for simple mode' });
- return;
- }
-
- // In custom mode, at least one content field is required — unless the request
- // is for cover, audio2audio, or repaint mode and a source audio is provided
- // (the source audio itself is the primary input; style/lyrics are optional).
- const requiresSourceAudio = taskType === 'cover' || taskType === 'audio2audio' || taskType === 'repaint';
- if (customMode && !style && !lyrics && !referenceAudioUrl && !(requiresSourceAudio && sourceAudioUrl)) {
- res.status(400).json({ error: 'Style, lyrics, or reference audio required for custom mode' });
+ // At least one content field is required — unless the request is for cover/repaint/lego
+ // and a source audio is provided (the source audio itself is the primary input).
+ const requiresSourceAudio = taskType === 'cover' || taskType === 'audio2audio' || taskType === 'repaint' || taskType === 'lego';
+ if (!songDescription && !style && !lyrics && !referenceAudioUrl && !(requiresSourceAudio && sourceAudioUrl)) {
+ res.status(400).json({ error: 'Please provide a description, style, lyrics, or audio' });
return;
}
@@ -283,7 +305,6 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
console.log(
`[API] POST /generate:` +
`\n taskType = ${taskType || 'text2music'}` +
- `\n customMode = ${customMode}` +
`\n ditModel = ${ditModel || '(default)'}` +
`\n sourceAudio = ${sourceAudioUrl || 'none'}` +
`\n repaint = [${repaintingStart ?? 'start'}, ${repaintingEnd ?? 'end'}]` +
@@ -292,7 +313,7 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
);
const params = {
- customMode,
+ customMode: true,
songDescription,
lyrics,
style,
@@ -377,7 +398,7 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
}
});
-router.get('/status/:jobId', authMiddleware, async (req: AuthenticatedRequest, res: Response) => {
+router.get('/status/:jobId', statusRateLimiter, authMiddleware, async (req: AuthenticatedRequest, res: Response) => {
try {
const jobResult = await pool.query(
`SELECT id, user_id, acestep_task_id, status, params, result, error, created_at
@@ -440,10 +461,18 @@ router.get('/status/:jobId', authMiddleware, async (req: AuthenticatedRequest, r
const songId = generateUUID();
try {
- const { buffer } = await downloadAudioToBuffer(audioUrl);
- const ext = audioUrl.includes('.flac') ? '.flac' : '.mp3';
+ let ext = '.mp3';
+ if (audioUrl.endsWith('.flac')) ext = '.flac';
+ else if (audioUrl.endsWith('.wav')) ext = '.wav';
const storageKey = `${req.user!.id}/${songId}${ext}`;
- await storage.upload(storageKey, buffer, `audio/${ext.slice(1)}`);
+ // Move the intermediate job file directly to its library location to avoid storing
+ // a duplicate copy of the (potentially large) audio file on disk.
+ const { rename, mkdir } = await import('fs/promises');
+ const srcPath = path.join(config.storage.audioDir, audioUrl.slice('/audio/'.length));
+ const dstDir = path.join(config.storage.audioDir, req.user!.id);
+ const dstPath = path.join(dstDir, `${songId}${ext}`);
+ await mkdir(dstDir, { recursive: true });
+ await rename(srcPath, dstPath);
const storedPath = storage.getPublicUrl(storageKey);
await pool.query(
@@ -710,6 +739,35 @@ router.get('/debug/:taskId', authMiddleware, async (req: AuthenticatedRequest, r
}
});
+// ── Debug log endpoints ───────────────────────────────────────────────────────
+
+/** List all in-memory jobs (for the debug panel job selector). */
+router.get('/logs', logRateLimiter, authMiddleware, async (_req: AuthenticatedRequest, res: Response) => {
+ try {
+ res.json({ jobs: listActiveJobs() });
+ } catch (error) {
+ res.status(500).json({ error: (error as Error).message });
+ }
+});
+
+/**
+ * Stream log lines for a specific job.
+ * Query param `after` (integer) returns only lines after that index for efficient polling.
+ */
+router.get('/logs/:jobId', logRateLimiter, authMiddleware, async (req: AuthenticatedRequest, res: Response) => {
+ try {
+ const after = parseInt(req.query.after as string || '0', 10);
+ const result = getJobLogs(req.params.jobId, isNaN(after) ? 0 : after);
+ if (!result) {
+ res.status(404).json({ error: 'Job not found' });
+ return;
+ }
+ res.json(result);
+ } catch (error) {
+ res.status(500).json({ error: (error as Error).message });
+ }
+});
+
// Format endpoint - uses LLM to enhance style/lyrics
// Spawn mode: runs `acestep-generate --mode format` with the prompt/lyrics as args
// HTTP mode: calls ACESTEP_API_URL/format_input
diff --git a/server/src/routes/referenceTrack.ts b/server/src/routes/referenceTrack.ts
index 4510b2f..542bf5c 100644
--- a/server/src/routes/referenceTrack.ts
+++ b/server/src/routes/referenceTrack.ts
@@ -4,16 +4,27 @@ import path from 'path';
import os from 'os';
import { promises as fs } from 'fs';
import { fileURLToPath } from 'url';
+import rateLimit from 'express-rate-limit';
import { pool } from '../db/pool.js';
import { authMiddleware, AuthenticatedRequest } from '../middleware/auth.js';
import { getStorageProvider } from '../services/storage/factory.js';
import { spawn } from 'child_process';
+import { runUnderstand } from '../services/acestep.js';
const router = Router();
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const AUDIO_DIR = path.join(__dirname, '../../public/audio');
+// Per-IP rate limiter for CPU-intensive understand operations (max 6 requests per minute)
+const understandRateLimiter = rateLimit({
+ windowMs: 60_000,
+ max: 6,
+ standardHeaders: true,
+ legacyHeaders: false,
+ message: { error: 'Too many requests — please wait before analysing another track' },
+});
+
const upload = multer({
storage: multer.memoryStorage(),
limits: { fileSize: 50 * 1024 * 1024 }, // 50MB max
@@ -322,4 +333,48 @@ router.delete('/:id', authMiddleware, async (req: AuthenticatedRequest, res: Res
}
});
+// Understand a reference track with ace-understand
+router.post('/:id/understand', understandRateLimiter, authMiddleware, async (req: AuthenticatedRequest, res: Response) => {
+ try {
+ const result = await pool.query(
+ 'SELECT user_id, storage_key FROM reference_tracks WHERE id = $1',
+ [req.params.id]
+ );
+ if (result.rows.length === 0) {
+ res.status(404).json({ error: 'Track not found' });
+ return;
+ }
+ if (result.rows[0].user_id !== req.user!.id) {
+ res.status(403).json({ error: 'Access denied' });
+ return;
+ }
+
+ const audioUrl = `/audio/${result.rows[0].storage_key}`;
+ const understood = await runUnderstand(audioUrl);
+ res.json(understood);
+ } catch (error) {
+ const msg = error instanceof Error ? error.message : 'Failed to understand audio';
+ console.error('Understand reference track error:', error);
+ res.status(500).json({ error: msg });
+ }
+});
+
+// Understand audio by URL (for source/generated audio without a reference track DB entry)
+router.post('/understand-url', understandRateLimiter, authMiddleware, async (req: AuthenticatedRequest, res: Response) => {
+ const { audioUrl } = req.body as { audioUrl?: string };
+ if (!audioUrl || typeof audioUrl !== 'string') {
+ res.status(400).json({ error: 'audioUrl is required' });
+ return;
+ }
+
+ try {
+ const understood = await runUnderstand(audioUrl);
+ res.json(understood);
+ } catch (error) {
+ const msg = error instanceof Error ? error.message : 'Failed to understand audio';
+ console.error('Understand URL error:', error);
+ res.status(500).json({ error: msg });
+ }
+});
+
export default router;
diff --git a/server/src/services/acestep.ts b/server/src/services/acestep.ts
index a6b975c..d62a2b9 100644
--- a/server/src/services/acestep.ts
+++ b/server/src/services/acestep.ts
@@ -9,9 +9,10 @@
*/
import { spawn } from 'child_process';
-import { writeFile, mkdir, readFile } from 'fs/promises';
+import { writeFile, mkdir, readFile, mkdtemp, rm } from 'fs/promises';
import { execFileSync } from 'child_process';
import { existsSync, readdirSync } from 'fs';
+import { tmpdir } from 'os';
import path from 'path';
import { fileURLToPath } from 'url';
import { config } from '../config/index.js';
@@ -41,7 +42,7 @@ function getAudioDuration(filePath: string): number {
// ---------------------------------------------------------------------------
export interface GenerationParams {
- customMode: boolean;
+ customMode?: boolean; // kept for backward compatibility; ignored in unified mode
songDescription?: string;
lyrics: string;
style: string;
@@ -59,7 +60,7 @@ export interface GenerationParams {
seed?: number;
thinking?: boolean;
enhance?: boolean;
- audioFormat?: 'mp3' | 'flac';
+ audioFormat?: 'wav' | 'mp3';
inferMethod?: 'ode' | 'sde';
shift?: number;
lmTemperature?: number;
@@ -82,10 +83,20 @@ export interface GenerationParams {
useAdg?: boolean;
cfgIntervalStart?: number;
cfgIntervalEnd?: number;
+ customTimesteps?: string;
useCotMetas?: boolean;
useCotCaption?: boolean;
useCotLanguage?: boolean;
autogen?: boolean;
+ constrainedDecodingDebug?: boolean;
+ allowLmBatch?: boolean;
+ getScores?: boolean;
+ getLrc?: boolean;
+ scoreScale?: number;
+ lmBatchChunkSize?: number;
+ trackName?: string;
+ completeTrackClasses?: string[];
+ isFormatCaption?: boolean;
ditModel?: string;
}
@@ -118,6 +129,8 @@ interface ActiveJob {
queuePosition?: number;
progress?: number;
stage?: string;
+ /** All raw lines emitted by ace-qwen3 / dit-vae (stdout + stderr), in order. */
+ logs: string[];
}
const activeJobs = new Map();
@@ -308,8 +321,20 @@ function runBinary(
let stdout = '';
let stderr = '';
let lineBuf = '';
+ let stdoutLineBuf = '';
- proc.stdout.on('data', (chunk: Buffer) => { stdout += chunk.toString(); });
+ proc.stdout.on('data', (chunk: Buffer) => {
+ const text = chunk.toString();
+ stdout += text;
+ // Stream stdout lines to onLine as well so they appear in the debug log
+ stdoutLineBuf += text;
+ const lines = stdoutLineBuf.split('\n');
+ stdoutLineBuf = lines.pop() ?? '';
+ for (const line of lines) {
+ const trimmed = line.trim();
+ if (trimmed && onLine) onLine(`[stdout] ${trimmed}`);
+ }
+ });
proc.stderr.on('data', (chunk: Buffer) => {
const text = chunk.toString();
stderr += text;
@@ -324,8 +349,10 @@ function runBinary(
proc.on('close', (code) => {
// Flush any partial last line that didn't end with a newline
+ if (stdoutLineBuf.trim() && onLine) onLine(`[stdout] ${stdoutLineBuf.trim()}`);
if (lineBuf.trim() && onLine) onLine(lineBuf.trim());
lineBuf = '';
+ stdoutLineBuf = '';
if (code === 0) {
resolve({ stdout, stderr });
@@ -382,6 +409,9 @@ function makeLmProgressHandler(job: ActiveJob): (line: string) => void {
const PHASE1_STEP_CEIL = 400;
return (line: string) => {
+ // Always capture the raw line for the debug log
+ job.logs.push(line);
+
// Phase1 LM decode: "[Phase1] step 100, 1 active, 19.0 tok/s"
const p1 = line.match(/^\[Phase1\] step (\d+),.*?([\d.]+) tok\/s/);
if (p1) {
@@ -426,6 +456,9 @@ function makeDitVaeProgressHandler(job: ActiveJob): (line: string) => void {
let ditTotalSteps = 8;
return (line: string) => {
+ // Always capture the raw line for the debug log
+ job.logs.push(line);
+
// DiT starting — capture step count: "[DiT] Starting: T=3470, S=1735, …, steps=8, …"
const ditStart = line.match(/^\[DiT\] Starting:.*?steps=(\d+)/);
if (ditStart) {
@@ -483,18 +516,18 @@ async function runViaSpawn(
const taskType = params.taskType || 'text2music';
const isCover = taskType === 'cover' || taskType === 'audio2audio';
const isRepaint = taskType === 'repaint';
+ const isLego = taskType === 'lego';
// Passthrough: taskType explicitly set, or audio codes provided without
// a source audio file (legacy callers that omit the taskType field).
const isPassthru = taskType === 'passthrough' || Boolean(params.audioCodes && !params.sourceAudioUrl);
// LLM (ace-qwen3) is only needed for plain text-to-music generation.
- // Cover, repaint, and passthrough all skip it.
- const skipLm = isCover || isRepaint || isPassthru;
+ // Cover, repaint, lego, and passthrough all skip it.
+ const skipLm = isCover || isRepaint || isLego || isPassthru;
// ── Debug: log what the UI/API client requested ──────────────────────────
console.log(
`[Job ${jobId}] Request received:` +
`\n mode = ${taskType}` +
- `\n customMode = ${params.customMode}` +
`\n ditModel = ${params.ditModel || '(default)'}` +
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
`\n repaintRegion = [${params.repaintingStart ?? 'start'}, ${params.repaintingEnd ?? 'end'}]` +
@@ -513,7 +546,8 @@ async function runViaSpawn(
// (cover / repaint / passthrough). Only include the fields each binary
// actually understands so the format stays clean and predictable.
const caption = params.style || 'pop music';
- const prompt = params.customMode ? caption : (params.songDescription || caption);
+ // Use song description when provided (user's natural-language intent), falling back to style/caption
+ const prompt = params.songDescription || caption;
// Instrumental: pass the special "[Instrumental]" lyrics marker so the LLM
// skips lyrics generation (as documented in the acestep.cpp README).
const lyrics = params.instrumental ? '[Instrumental]' : (params.lyrics || '');
@@ -535,7 +569,7 @@ async function runViaSpawn(
if (params.timeSignature) requestJson.timesignature = params.timeSignature;
if (skipLm) {
- // ── Cover / repaint / passthrough: ace-qwen3 is skipped ─────────────
+ // ── Cover / repaint / lego / passthrough: ace-qwen3 is skipped ──────
// Add only the mode-specific fields that dit-vae cares about.
if (isPassthru) {
if (!params.audioCodes) {
@@ -554,6 +588,25 @@ async function runViaSpawn(
// Note: sourceAudioUrl is guaranteed here — validated in processGeneration.
requestJson.repainting_start = params.repaintingStart ?? -1;
requestJson.repainting_end = params.repaintingEnd ?? -1;
+ } else if (isLego) {
+ // Lego: generate a new instrument track layered over an existing backing track.
+ // Requires the base model (acestep-v15-base) and --src-audio.
+ // The "lego" field holds the track name (e.g. "guitar", "drums").
+ if (!params.trackName) {
+ throw new Error("task_type='lego' requires a track name (e.g. 'guitar')");
+ }
+ requestJson.lego = params.trackName;
+ // Which existing tracks are "complete" and should not be overwritten.
+ if (params.completeTrackClasses && params.completeTrackClasses.length > 0) {
+ requestJson.complete_track_classes = params.completeTrackClasses;
+ }
+ // Lego has strict parameter requirements per the spec — always enforce them
+ // regardless of what the frontend sent, so the binary never rejects the request.
+ requestJson.inference_steps = 50;
+ requestJson.guidance_scale = 7.0;
+ // shift=1.0 is a hard requirement for lego (the spec example always uses 1.0;
+ // using the normal default of 3.0 causes dit-vae to reject the request).
+ requestJson.shift = 1.0;
}
} else {
// ── Text-to-music: include LM parameters for ace-qwen3 ──────────────
@@ -563,12 +616,15 @@ async function runViaSpawn(
requestJson.lm_top_p = params.lmTopP ?? 0.9;
requestJson.lm_top_k = params.lmTopK ?? 0;
requestJson.lm_negative_prompt = params.lmNegativePrompt || '';
+ requestJson.use_cot_caption = params.useCotCaption ?? true;
}
const requestPath = path.join(tmpDir, 'request.json');
await writeFile(requestPath, JSON.stringify(requestJson, null, 2));
console.log(`[Job ${jobId}] Request JSON written to ${requestPath}:`);
console.log(JSON.stringify(requestJson, null, 2));
+ job.logs.push(`=== Job ${jobId} started — mode: ${taskType} ===`);
+ job.logs.push(`Request JSON: ${JSON.stringify(requestJson, null, 2)}`);
// ── Step 1: ace-qwen3 — LLM (lyrics + audio codes) ────────────────────
// Skipped when:
@@ -590,7 +646,9 @@ async function runViaSpawn(
if (batchSize > 1) lmArgs.push('--batch', String(batchSize));
lmArgs.push(...parseExtraArgs(process.env.ACE_QWEN3_EXTRA_ARGS));
- console.log(`[Job ${jobId}] Running ace-qwen3:\n ${lmBin} ${lmArgs.join(' ')}`);
+ const lmCmd = `${lmBin} ${lmArgs.join(' ')}`;
+ console.log(`[Job ${jobId}] Running ace-qwen3:\n ${lmCmd}`);
+ job.logs.push(`\n--- Running ace-qwen3 ---\n$ ${lmCmd}`);
await runBinary(lmBin, lmArgs, 'ace-qwen3', undefined, makeLmProgressHandler(job));
// Collect enriched JSON files produced by ace-qwen3:
@@ -618,9 +676,25 @@ async function runViaSpawn(
const ditVaeBin = config.acestep.ditVaeBin!;
const textEncoderModel = config.acestep.textEncoderModel;
- const ditModel = resolveParamDitModel(params.ditModel);
const vaeModel = config.acestep.vaeModel;
+ // Lego mode mandates the base DiT model — no other variant will work.
+ // Override whatever the frontend sent and fail early with a clear message
+ // if the base model has not been downloaded yet.
+ let ditModel: string;
+ if (isLego) {
+ const baseModel = config.acestep.baseModel;
+ if (!baseModel) {
+ throw new Error(
+ 'Lego mode requires the base DiT model (acestep-v15-base) ' +
+ '— download it via the Model Manager first'
+ );
+ }
+ ditModel = baseModel;
+ } else {
+ ditModel = resolveParamDitModel(params.ditModel);
+ }
+
if (!textEncoderModel) throw new Error('Text-encoder model not found — run models.sh first');
if (!ditModel) throw new Error('DiT model not found — run models.sh first');
if (!vaeModel) throw new Error('VAE model not found — run models.sh first');
@@ -655,20 +729,30 @@ async function runViaSpawn(
ditArgs.push('--lora-scale', String(loraState.scale));
}
+ // WAV format: pass --wav so the binary outputs WAV; MP3 (default): no flag,
+ // the binary outputs MP3 natively (upstream acestep-cpp has native MP3 support).
+ const wantWav = (params.audioFormat === 'wav');
+ if (wantWav) {
+ ditArgs.push('--wav');
+ }
+
ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS));
- console.log(`[Job ${jobId}] Running dit-vae:\n ${ditVaeBin} ${ditArgs.join(' ')}`);
+ const ditCmd = `${ditVaeBin} ${ditArgs.join(' ')}`;
+ console.log(`[Job ${jobId}] Running dit-vae:\n ${ditCmd}`);
+ job.logs.push(`\n--- Running dit-vae ---\n$ ${ditCmd}`);
await runBinary(ditVaeBin, ditArgs, 'dit-vae', undefined, makeDitVaeProgressHandler(job));
- // ── Collect generated WAV files ─────────────────────────────────────────
- // dit-vae places output WAVs alongside each enriched JSON:
- // request0.json → request00.wav, request01.wav, …
- // request1.json → request10.wav, request11.wav, …
+ // ── Collect generated audio files ──────────────────────────────────────
+ // dit-vae places output files alongside each enriched JSON:
+ // With --wav: request0.json → request00.wav, request01.wav, …
+ // Without --wav: request0.json → request00.mp3, request01.mp3, …
const { copyFile, rm } = await import('fs/promises');
+ const finalExt = wantWav ? 'wav' : 'mp3';
let rawAudioPaths: string[] = [];
try {
rawAudioPaths = readdirSync(tmpDir)
- .filter(f => /^request\d+\.wav$/.test(f))
+ .filter(f => new RegExp(`^request\\d+\\.${finalExt}$`).test(f))
.sort()
.map(f => path.join(tmpDir, f));
} catch { /* ignore */ }
@@ -677,10 +761,10 @@ async function runViaSpawn(
throw new Error('dit-vae produced no audio files');
}
- // Move WAVs to AUDIO_DIR with a stable, job-scoped name
+ // Copy files to AUDIO_DIR with a stable, job-scoped name
const audioPaths: string[] = [];
for (let i = 0; i < rawAudioPaths.length; i++) {
- const dest = path.join(AUDIO_DIR, `${jobId}_${i}.wav`);
+ const dest = path.join(AUDIO_DIR, `${jobId}_${i}.${finalExt}`);
await copyFile(rawAudioPaths[i], dest);
audioPaths.push(dest);
}
@@ -706,12 +790,18 @@ async function runViaSpawn(
status: 'succeeded',
};
job.rawResponse = enrichedMeta;
+ job.logs.push(`\n=== Job ${jobId} completed successfully — ${audioUrls.length} file(s): ${audioUrls.join(', ')} ===`);
console.log(`[Job ${jobId}] Completed successfully with ${audioUrls.length} audio file(s): ${audioUrls.join(', ')}`);
// Clean up tmp directory
await rm(tmpDir, { recursive: true, force: true }).catch(() => { /* best-effort */ });
} catch (err) {
+ // Append error to the debug log before re-throwing
+ if (activeJobs.has(jobId)) {
+ const j = activeJobs.get(jobId)!;
+ j.logs.push(`\n=== Job ${jobId} FAILED: ${(err as Error).message} ===`);
+ }
// Best-effort cleanup on failure
try {
const { rm } = await import('fs/promises');
@@ -727,7 +817,7 @@ async function runViaSpawn(
function buildHttpRequest(params: GenerationParams): Record<string, unknown> {
const caption = params.style || 'pop music';
- const prompt = params.customMode ? caption : (params.songDescription || caption);
+ const prompt = params.songDescription || caption;
const lyrics = params.instrumental ? '' : (params.lyrics || '');
const isThinking = params.thinking ?? false;
const isEnhance = params.enhance ?? false;
@@ -918,6 +1008,7 @@ export async function generateMusicViaAPI(params: GenerationParams): Promise<{ j
startTime: Date.now(),
status: 'queued',
queuePosition: jobQueue.length + 1,
+ logs: [],
};
activeJobs.set(jobId, job);
@@ -945,7 +1036,6 @@ async function processGeneration(
console.log(
`[Job ${jobId}] Starting generation (${mode} mode):` +
`\n taskType = ${params.taskType || 'text2music'}` +
- `\n customMode = ${params.customMode}` +
`\n ditModel = ${params.ditModel || '(default)'}` +
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
`\n audioCodes = ${params.audioCodes ? '[provided]' : 'none'}`
@@ -966,6 +1056,13 @@ async function processGeneration(
return;
}
+ if (params.taskType === 'lego' && !params.sourceAudioUrl) {
+ job.status = 'failed';
+ job.error = "task_type='lego' requires a source audio (--src-audio)";
+ console.error(`[Job ${jobId}] Validation failed: ${job.error}`);
+ return;
+ }
+
try {
job.stage = 'Generating music...';
if (useSpawnMode(params)) {
@@ -1014,6 +1111,113 @@ export function getJobRawResponse(jobId: string): unknown | null {
return activeJobs.get(jobId)?.rawResponse ?? null;
}
+/**
+ * Returns the captured log lines for a job (all raw output from ace-qwen3 + dit-vae).
+ * Optionally accepts an `after` offset to return only new lines since the last poll.
+ */
+export function getJobLogs(jobId: string, after = 0): { lines: string[]; total: number; status: string } | null {
+ const job = activeJobs.get(jobId);
+ if (!job) return null;
+ return {
+ lines: job.logs.slice(after),
+ total: job.logs.length,
+ status: job.status,
+ };
+}
+
+/**
+ * Returns a summary of all in-memory jobs (most recent first), for the debug log list.
+ */
+export function listActiveJobs(): Array<{ jobId: string; status: string; startTime: number; stage?: string; logCount: number }> {
+ const result: Array<{ jobId: string; status: string; startTime: number; stage?: string; logCount: number }> = [];
+ for (const [jobId, job] of activeJobs) {
+ result.push({ jobId, status: job.status, startTime: job.startTime, stage: job.stage, logCount: job.logs.length });
+ }
+ return result.sort((a, b) => b.startTime - a.startTime);
+}
+
+// ---------------------------------------------------------------------------
+// Ace Understand — reverse pipeline: audio → metadata + lyrics
+// ---------------------------------------------------------------------------
+
+export interface UnderstandResult {
+ caption?: string;
+ lyrics?: string;
+ bpm?: number;
+ duration?: number;
+ keyscale?: string;
+ timesignature?: string;
+ vocal_language?: string;
+ seed?: number;
+ inference_steps?: number;
+ guidance_scale?: number;
+ shift?: number;
+ audio_cover_strength?: number;
+ repainting_start?: number;
+ repainting_end?: number;
+ lm_temperature?: number;
+ lm_cfg_scale?: number;
+ lm_top_p?: number;
+ lm_top_k?: number;
+ lm_negative_prompt?: string;
+ use_cot_caption?: boolean;
+ audio_codes?: string;
+ [key: string]: unknown;
+}
+
+/**
+ * Run ace-understand on a source audio file and return the parsed result JSON.
+ *
+ * The binary performs a reverse pipeline: VAE-encodes the audio, FSQ-tokenises
+ * the latent, then uses the LM to generate metadata (caption, lyrics, bpm, etc.)
+ * — the same fields that ace-qwen3 would fill for generation.
+ */
+export async function runUnderstand(audioUrl: string): Promise<UnderstandResult> {
+ const understandBin = config.acestep.understandBin;
+ if (!understandBin) {
+ throw new Error('ace-understand binary not found — rebuild acestep.cpp or set ACE_UNDERSTAND_BIN');
+ }
+
+ const lmModel = config.acestep.lmModel;
+ const ditModel = config.acestep.ditModel;
+ const vaeModel = config.acestep.vaeModel;
+
+ if (!lmModel) throw new Error('LM model not found — run models.sh first');
+ if (!ditModel) throw new Error('DiT model not found — run models.sh first');
+ if (!vaeModel) throw new Error('VAE model not found — run models.sh first');
+
+ const srcAudioPath = resolveAudioPath(audioUrl);
+ if (!existsSync(srcAudioPath)) {
+ throw new Error(`Audio file not found: ${srcAudioPath}`);
+ }
+
+ // Write output JSON to a temp file so we can parse it reliably.
+ const tmpDir = await mkdtemp(path.join(tmpdir(), 'ace-understand-'));
+ const outJsonPath = path.join(tmpDir, 'understand.json');
+
+ try {
+ const args: string[] = [
+ '--src-audio', srcAudioPath,
+ '--dit', ditModel,
+ '--vae', vaeModel,
+ '--model', lmModel,
+ '-o', outJsonPath,
+ ];
+
+ console.log(`[understand] Running ace-understand:\n ${understandBin} ${args.join(' ')}`);
+
+ await runBinary(understandBin, args, 'ace-understand');
+
+ // Read and parse the output JSON
+ const raw = await readFile(outJsonPath, 'utf-8');
+ const result: UnderstandResult = JSON.parse(raw);
+ console.log('[understand] Result:', JSON.stringify(result, null, 2));
+ return result;
+ } finally {
+ await rm(tmpDir, { recursive: true, force: true }).catch(() => { /* best-effort */ });
+ }
+}
+
export async function discoverEndpoints(): Promise {
const mode = useSpawnMode() ? 'spawn' : 'http';
return {
diff --git a/services/api.ts b/services/api.ts
index 20a5436..9483dbc 100644
--- a/services/api.ts
+++ b/services/api.ts
@@ -266,7 +266,7 @@ export interface GenerationParams {
randomSeed?: boolean;
seed?: number;
thinking?: boolean;
- audioFormat?: 'mp3' | 'flac';
+ audioFormat?: 'wav' | 'mp3';
inferMethod?: 'ode' | 'sde';
shift?: number;
@@ -418,6 +418,12 @@ export const generateApi = {
scale: number;
path: string;
}> => api('/api/lora/status', { token }),
+
+ understandReferenceTrack: (trackId: string, token: string): Promise<Record<string, unknown>> =>
+ api(`/api/reference-tracks/${trackId}/understand`, { method: 'POST', token }),
+
+ understandAudioUrl: (audioUrl: string, token: string): Promise<Record<string, unknown>> =>
+ api('/api/reference-tracks/understand-url', { method: 'POST', body: { audioUrl }, token }),
};
// Users API
diff --git a/types.ts b/types.ts
index 8f3927a..cca52c0 100644
--- a/types.ts
+++ b/types.ts
@@ -79,7 +79,7 @@ export interface GenerationParams {
seed: number;
thinking: boolean;
enhance?: boolean;
- audioFormat: 'mp3' | 'flac';
+ audioFormat: 'wav' | 'mp3';
inferMethod: 'ode' | 'sde';
shift: number;
@@ -152,4 +152,4 @@ export interface UserProfile {
}
// Simplified views for ACE-Step UI
-export type View = 'create' | 'library' | 'models' | 'profile' | 'song' | 'playlist' | 'search' | 'news';
+export type View = 'create' | 'library' | 'models' | 'profile' | 'song' | 'playlist' | 'search' | 'news' | 'debug';