Skip to content

Commit 54a186d

Browse files
mp3: 3-axis quant_compare, PE allocation, conditional MS (+0.037 ODG)
Outer loop: compare peak distortion, then average, then band count instead of just counting over-threshold bands. Spreads noise evenly. Bit allocation: weight granule budget by perceptual entropy (20-80% clamp) instead of blind 50/50 split. MS stereo: disable when side energy > 1.2x mid (uncorrelated content). PEAQ ITU-R BS.1387 at 128kbps (peaqb-fast): Encoder ODG RmsNoiseLoud RelDistFrames License mp3enc (ours) -1.02 0.34 8.8% MIT mp3enc (prev) -1.05 0.54 9.3% MIT LAME 3.100 -0.90 0.26 6.3% LGPL
1 parent 8c7954e commit 54a186d

2 files changed

Lines changed: 156 additions & 51 deletions

File tree

mp3/mp3enc-quant.h

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -489,29 +489,72 @@ static int mp3enc_outer_loop(const float * xr,
489489
// Max 25 passes: enough for convergence at all bitrates.
490490
float noise[22]; // 22 SFB bands before table terminator
491491
int best_ix[576];
492-
mp3enc_granule_info best_gi = gi;
493-
int best_total = gi.part2_3_length;
494-
int best_over = 21; // start pessimistic
492+
mp3enc_granule_info best_gi = gi;
493+
int best_total = gi.part2_3_length;
494+
int best_over = 21; // start pessimistic
495+
float best_max_db = 999.0f; // worst-band noise in dB over threshold
496+
float best_tot_db = 999.0f; // total over-threshold noise in dB
495497
memcpy(best_ix, ix, sizeof(best_ix));
496498

497499
for (int iter = 0; iter < 25; iter++) {
498500
// Compute noise per SFB with current quantization
499501
mp3enc_calc_noise(xr, ix, gi.global_gain, gi.scalefac_l, gi.scalefac_scale, gi.preflag, sfb_table, noise);
500502

501-
// Count bands over threshold
502-
int over_count = 0;
503+
// Compute noise metrics for 3-axis comparison (GPSYCHO approach).
504+
// Instead of just counting bands over threshold, track:
505+
// - max_over_db: worst violation in dB (peak distortion)
506+
// - tot_over_db: sum of violations in dB (for average)
507+
// - over_count: number of distorted bands
508+
// This prefers solutions that minimize peak distortion and spread
509+
// remaining noise evenly, rather than concentrating it in one band.
510+
int over_count = 0;
511+
float max_over_db = 0.0f;
512+
float tot_over_db = 0.0f;
513+
503514
for (int sfb = 0; sfb < 21; sfb++) {
504515
if (xmin[sfb] > 0.0f && noise[sfb] > xmin[sfb]) {
505516
over_count++;
517+
float over_db = 10.0f * log10f(noise[sfb] / xmin[sfb]);
518+
tot_over_db += over_db;
519+
if (over_db > max_over_db) {
520+
max_over_db = over_db;
521+
}
522+
}
523+
}
524+
525+
// 3-axis quant_compare (inspired by LAME GPSYCHO outer_loop):
526+
// 1. Clean (over=0) always beats dirty (over>0)
527+
// 2. Among clean solutions: prefer fewer bits
528+
// 3. Among dirty solutions: minimize peak, then average, then count
529+
bool is_better = false;
530+
if (over_count == 0 && best_over > 0) {
531+
is_better = true;
532+
} else if (over_count == 0 && best_over == 0) {
533+
is_better = (gi.part2_3_length < best_total);
534+
} else if (over_count > 0 && best_over > 0) {
535+
// both dirty: compare peak distortion first
536+
if (max_over_db < best_max_db - 0.5f) {
537+
// significantly lower peak -> better
538+
is_better = true;
539+
} else if (max_over_db < best_max_db + 0.5f) {
540+
// similar peak: compare average violation
541+
float avg = tot_over_db / (float) over_count;
542+
float best_avg = (best_over > 0) ? best_tot_db / (float) best_over : 0.0f;
543+
if (avg < best_avg - 0.3f) {
544+
is_better = true;
545+
} else if (avg < best_avg + 0.3f) {
546+
// similar average: prefer fewer violated bands
547+
is_better = (over_count < best_over);
548+
}
506549
}
507550
}
508551

509-
// Track best result: prefer fewer over-threshold bands,
510-
// then fewer total bits as tiebreaker.
511-
if (over_count < best_over || (over_count == best_over && gi.part2_3_length < best_total)) {
512-
best_gi = gi;
513-
best_total = gi.part2_3_length;
514-
best_over = over_count;
552+
if (is_better) {
553+
best_gi = gi;
554+
best_total = gi.part2_3_length;
555+
best_over = over_count;
556+
best_max_db = max_over_db;
557+
best_tot_db = tot_over_db;
515558
memcpy(best_ix, ix, sizeof(best_ix));
516559
}
517560

mp3/mp3enc.h

Lines changed: 102 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,11 @@ static int mp3enc_encode_frame(mp3enc_t * enc, const float * pcm) {
214214
int nch = enc->channels;
215215
int padding = mp3enc_get_padding(enc);
216216

217-
// Use MS stereo for stereo input (joint stereo mode)
218-
// mode=1 (joint), mode_ext=2 (MS on, intensity off)
217+
// MS stereo decision is deferred until after MDCT energy analysis.
218+
// mode=1 (joint) allows switching M/S on or off per frame.
219+
// mode=0 (stereo) would force L/R always. mode=3 (mono).
219220
int mode = (nch == 1) ? 3 : 1;
220-
int mode_ext = (nch == 1) ? 0 : 2;
221+
int mode_ext = 0; // set after MDCT energy analysis
221222

222223
// Setup header
223224
mp3enc_header hdr;
@@ -257,74 +258,136 @@ static int mp3enc_encode_frame(mp3enc_t * enc, const float * pcm) {
257258
// Total main_data bits: this frame's area + reservoir from previous frame
258259
int total_md_bits = main_data_bits + resv_bytes * 8;
259260

260-
// Mean bits per granule (from total budget)
261-
int mean_bits = total_md_bits / 2;
261+
// Phase 1: compute MDCT + psy for ALL granules before bit allocation.
262+
// We need PE from both granules to weight the bit budget.
263+
float mdct_all[2][2][576]; // [granule][channel][576 MDCT lines]
264+
float saved_xmin[2][2][MP3ENC_PSY_SFB_MAX];
265+
float saved_pe[2] = { 0.0f, 0.0f };
262266

263-
int ix[2][2][576]; // [granule][channel][line]
264-
float mdct_lr[2][576]; // MDCT output per channel before M/S transform
265-
266-
int total_bits_used = 0;
267-
int intra_resv = 0; // intra-frame reservoir: bits saved by granule 0 for granule 1
267+
// Accumulators for MS stereo decision
268+
float energy_mid = 0.0f;
269+
float energy_side = 0.0f;
268270

269271
for (int gr = 0; gr < 2; gr++) {
270272
int pcm_offset = gr * 576;
271273

272-
// Step 1: filterbank + MDCT for all channels
274+
// filterbank + MDCT for all channels
273275
for (int ch = 0; ch < nch; ch++) {
274276
const float * ch_pcm = pcm + ch * 1152 + pcm_offset;
275-
276-
// Run analysis filterbank: 576 PCM samples = 18 calls of 32 samples
277-
float sb_out[32];
277+
float sb_out[32];
278278
for (int slot = 0; slot < 18; slot++) {
279279
enc->filter[ch].process(ch_pcm + slot * 32, sb_out);
280280
for (int sb = 0; sb < 32; sb++) {
281281
enc->sb_cur[ch][sb][slot] = sb_out[sb];
282282
}
283-
284283
// frequency inversion: negate odd subbands at odd time slots
285284
if (slot & 1) {
286285
for (int sb = 1; sb < 32; sb += 2) {
287286
enc->sb_cur[ch][sb][slot] = -enc->sb_cur[ch][sb][slot];
288287
}
289288
}
290289
}
291-
292-
// MDCT: transform subbands to 576 frequency lines
293-
mp3enc_mdct_granule(enc->sb_prev[ch], enc->sb_cur[ch], mdct_lr[ch]);
294-
295-
// Save current subbands as previous for next granule
290+
mp3enc_mdct_granule(enc->sb_prev[ch], enc->sb_cur[ch], mdct_all[gr][ch]);
296291
memcpy(enc->sb_prev[ch], enc->sb_cur[ch], sizeof(enc->sb_cur[ch]));
297292
}
298293

299-
// Step 2: MS stereo transform
294+
// MS stereo energy analysis (before transform, on L/R data).
295+
// Accumulate mid and side energy across both granules to decide
296+
// whether M/S coding is beneficial for this frame.
297+
// M/S wins when L and R are correlated (side energy is small).
300298
if (nch == 2) {
299+
for (int i = 0; i < enc->lowpass_line && i < 576; i++) {
300+
float l = mdct_all[gr][0][i];
301+
float r = mdct_all[gr][1][i];
302+
float m = l + r;
303+
float s = l - r;
304+
energy_mid += m * m;
305+
energy_side += s * s;
306+
}
307+
}
308+
}
309+
310+
// MS stereo decision: use M/S when channels are correlated enough
311+
// that the side channel is cheap to encode. FhG "almost always uses
312+
// ms_stereo" (GPSYCHO docs). We use it unless side energy dominates,
313+
// which means the channels are very different (rare for music).
314+
bool use_ms = false;
315+
if (nch == 2) {
316+
// Use M/S unless side channel has more energy than mid.
317+
// This is generous -- almost always enables M/S (like FhG).
318+
use_ms = (energy_side < energy_mid * 1.2f) || (energy_mid < 1e-20f);
319+
mode_ext = use_ms ? 2 : 0;
320+
hdr.mode_ext = mode_ext;
321+
}
322+
323+
// Now apply M/S transform + lowpass + psy for both granules
324+
for (int gr = 0; gr < 2; gr++) {
325+
// MS stereo transform
326+
if (use_ms) {
301327
static const float ms_scale = 0.7071067811865476f; // 1/sqrt(2)
302328
for (int i = 0; i < 576; i++) {
303-
float l = mdct_lr[0][i];
304-
float r = mdct_lr[1][i];
305-
mdct_lr[0][i] = (l + r) * ms_scale;
306-
mdct_lr[1][i] = (l - r) * ms_scale;
329+
float l = mdct_all[gr][0][i];
330+
float r = mdct_all[gr][1][i];
331+
mdct_all[gr][0][i] = (l + r) * ms_scale;
332+
mdct_all[gr][1][i] = (l - r) * ms_scale;
307333
}
308334
}
309335

310-
// Step 2b: adaptive lowpass
336+
// adaptive lowpass
311337
for (int ch = 0; ch < nch; ch++) {
312338
for (int i = enc->lowpass_line; i < 576; i++) {
313-
mdct_lr[ch][i] = 0.0f;
339+
mdct_all[gr][ch][i] = 0.0f;
314340
}
315341
}
316342

317-
// Step 3: run psy for all channels before bit allocation
318-
float saved_xmin[2][MP3ENC_PSY_SFB_MAX];
343+
// psy: compute masking thresholds and perceptual entropy
319344
for (int ch = 0; ch < nch; ch++) {
320-
enc->psy.compute(mdct_lr[ch], sfb_long, enc->sr_index, ch);
321-
memcpy(saved_xmin[ch], enc->psy.xmin, sizeof(enc->psy.xmin));
345+
enc->psy.compute(mdct_all[gr][ch], sfb_long, enc->sr_index, ch);
346+
memcpy(saved_xmin[gr][ch], enc->psy.xmin, sizeof(enc->psy.xmin));
347+
saved_pe[gr] += enc->psy.pe;
322348
}
349+
}
323350

324-
// Step 4: bit allocation
325-
int max_bits = mean_bits + intra_resv;
351+
// Phase 2: PE-weighted bit allocation + quantization.
352+
// Give more bits to granules with higher perceptual entropy (more complex
353+
// signal). ISO 11172-3 computes PE but then ignores it for allocation;
354+
// LAME uses it to drive the bit reservoir. We weight the per-granule
355+
// budget proportionally to PE, clamped to avoid starving either granule.
356+
int gr_budget[2];
357+
float pe_sum = saved_pe[0] + saved_pe[1];
358+
if (pe_sum > 1e-6f) {
359+
float frac0 = saved_pe[0] / pe_sum;
360+
gr_budget[0] = (int) ((float) total_md_bits * frac0);
361+
gr_budget[1] = total_md_bits - gr_budget[0];
362+
// clamp: neither granule gets less than 20% or more than 80%
363+
int lo = total_md_bits / 5;
364+
int hi = total_md_bits * 4 / 5;
365+
for (int gr = 0; gr < 2; gr++) {
366+
if (gr_budget[gr] < lo) {
367+
gr_budget[gr] = lo;
368+
}
369+
if (gr_budget[gr] > hi) {
370+
gr_budget[gr] = hi;
371+
}
372+
}
373+
// re-normalize after clamp
374+
int clamped_sum = gr_budget[0] + gr_budget[1];
375+
int bits_to_spread = total_md_bits - clamped_sum;
376+
gr_budget[0] += bits_to_spread / 2;
377+
gr_budget[1] += bits_to_spread - bits_to_spread / 2;
378+
} else {
379+
gr_budget[0] = total_md_bits / 2;
380+
gr_budget[1] = total_md_bits - gr_budget[0];
381+
}
382+
383+
int ix[2][2][576];
384+
int total_bits_used = 0;
385+
int intra_resv = 0;
386+
387+
for (int gr = 0; gr < 2; gr++) {
388+
int max_bits = gr_budget[gr] + intra_resv;
326389

327-
// Don't exceed remaining budget
390+
// don't exceed remaining budget
328391
int remaining_bits = total_md_bits - total_bits_used;
329392
if (max_bits > remaining_bits) {
330393
max_bits = remaining_bits;
@@ -335,17 +398,16 @@ static int mp3enc_encode_frame(mp3enc_t * enc, const float * pcm) {
335398

336399
int bits_per_ch = max_bits / nch;
337400

338-
// Step 5: quantize each channel with saved thresholds
401+
// quantize each channel with psy thresholds
339402
int gr_bits_used = 0;
340403
for (int ch = 0; ch < nch; ch++) {
341-
int bits = mp3enc_outer_loop(mdct_lr[ch], ix[gr][ch], si.gr[gr][ch], saved_xmin[ch], bits_per_ch, sfb_long,
342-
enc->sr_index, gr, si.scfsi[ch]);
404+
int bits = mp3enc_outer_loop(mdct_all[gr][ch], ix[gr][ch], si.gr[gr][ch], saved_xmin[gr][ch], bits_per_ch,
405+
sfb_long, enc->sr_index, gr, si.scfsi[ch]);
343406
gr_bits_used += bits;
344407
}
345408

346-
// Track intra-frame savings: bits not used by this granule
347-
// become available for the next granule in this frame.
348-
intra_resv += mean_bits - gr_bits_used;
409+
// intra-frame savings: unused bits carry to next granule
410+
intra_resv += gr_budget[gr] - gr_bits_used;
349411
if (intra_resv < 0) {
350412
intra_resv = 0;
351413
}

0 commit comments

Comments
 (0)