From ae5c013c238d21ac56579bae19c4bd51aea86778 Mon Sep 17 00:00:00 2001
From: Matt Liberty
Date: Thu, 9 Apr 2026 17:37:17 +0000
Subject: [PATCH] [gpl] Include Liberty internal power in MBFF clustering cost

The MBFF algorithm previously used only leakage power to decide whether
to replace single-bit flip-flops with multi-bit cells. For flip-flops,
internal power dominates total power, and MBFF cells share scan (SE/SI)
and clock structures across bits, giving 40-90% savings on those pins.
Ignoring internal power caused clustering to increase total power on
some PDKs.

Add getInternalEnergy(), which sums average internal energy across all
pins (CK, D, Q, SE, SI) from Liberty internal_power tables. Use this
alongside leakage in three places:

- SetRatios: norm_power_ uses total estimated power (leakage +
  internal_energy * clock_activity) so the ILP cost function reflects
  total power, not just leakage.
- ReadLibs / SelectBestTrays: ReadLibs now records every tray candidate,
  and SelectBestTrays selects the best tray per size by minimum total
  estimated power instead of minimum leakage, so cells with lower total
  power (e.g. SVT over LVT) are preferred even when their leakage is
  higher.
- SetVars: select the single-bit baseline cell by lowest total estimated
  power, with both leakage and internal energy paired from the same
  cell.

The clock period is obtained from SDC for each flop group in Run(),
before SelectBestTrays is called, so tray selection can account for
internal power. The cost falls back to leakage-only when no clock is
defined or no internal_power tables exist.

Signed-off-by: Matt Liberty --- src/gpl/src/mbff.cpp | 346 ++++++++++++++++++++++++-------- src/gpl/src/mbff.h | 20 ++ src/gpl/test/mbff_orig_name.ok | 18 +- src/gpl/test/mbff_orig_name.tcl | 2 +- 4 files changed, 290 insertions(+), 96 deletions(-) diff --git a/src/gpl/src/mbff.cpp b/src/gpl/src/mbff.cpp index dd3c15bc0fc..6d11a1ec033 100644 --- a/src/gpl/src/mbff.cpp +++ b/src/gpl/src/mbff.cpp @@ -2178,20 +2178,129 @@ float MBFF::getLeakage(odb::dbMaster* master) return cell_leakage; } +float MBFF::getInternalEnergy(odb::dbInst* inst) +{ + odb::dbMaster* master = inst->getMaster(); + sta::Cell* cell = network_->dbToSta(master); + sta::LibertyCell* lib_cell = network_->libertyCell(cell); + sta::LibertyCell* corner_cell + = lib_cell->sceneCell(corner_, sta::MinMax::max()); + if (!corner_cell) { + return 0.0; + } + + // Sum average internal energy across all pins (CK, D, Q, SE, SI, ...). + // For each pin, when conditions partition the input states; we average + // across all groups (uniform duty assumption). This captures the full + // cell energy profile -- MBFF cells share SE/SI/CK structures across + // bits, giving substantial savings that clock-pin-only analysis misses. 
+ float total_energy = 0.0; + for (odb::dbITerm* iterm : inst->getITerms()) { + if (IsSupplyPin(iterm)) { + continue; + } + const sta::Pin* pin = network_->dbToSta(iterm); + const sta::LibertyPort* port = network_->libertyPort(pin); + if (!port) { + continue; + } + const sta::LibertyPort* scene_port + = port->scenePort(corner_, sta::MinMax::max()); + if (!scene_port) { + continue; + } + float port_energy_sum = 0.0; + int group_count = 0; + for (const sta::InternalPower* pwr : + corner_cell->internalPowers(scene_port)) { + float energy = 0.0; + int rf_count = 0; + for (const sta::RiseFall* rf : sta::RiseFall::range()) { + const sta::InternalPowerModel& model = pwr->model(rf); + const sta::TableModel* tbl = model.model(); + if (!tbl) { + continue; + } + float v1 = 0, v2 = 0; + if (tbl->axis1()) { + v1 = (tbl->axis1()->min() + tbl->axis1()->max()) / 2.0f; + } + if (tbl->axis2()) { + v2 = (tbl->axis2()->min() + tbl->axis2()->max()) / 2.0f; + } + energy += tbl->findValue(v1, v2, 0.0f); + rf_count++; + } + if (rf_count > 0) { + port_energy_sum += energy / rf_count; + group_count++; + } + } + if (group_count > 0) { + const float pin_energy = port_energy_sum / group_count; + total_energy += pin_energy; + debugPrint(log_, + GPL, + "mbff", + 2, + " pin {} groups={} energy={}", + port->name(), + group_count, + pin_energy); + } + } + return total_energy; +} + +float MBFF::clockActivity() const +{ + return (clock_period_ > 0) ? 
(2.0 / clock_period_) : 0.0; +} + +float MBFF::getClockPeriod(odb::dbInst* ff_inst) +{ + float period = 0.0; + for (odb::dbITerm* iterm : ff_inst->getITerms()) { + if (IsClockPin(iterm)) { + const sta::Pin* sta_pin = network_->dbToSta(iterm); + for (const sta::Clock* clk : sta_->clocks(sta_pin, corner_->mode())) { + if (period == 0.0 || clk->period() < period) { + period = clk->period(); + } + } + break; + } + } + return period; +} + void MBFF::SetVars(const std::vector& flops) { // get min height and width single_bit_height_ = std::numeric_limits::max(); single_bit_width_ = std::numeric_limits::max(); single_bit_power_ = std::numeric_limits::max(); + const float activity = clockActivity(); + std::map energy_cache; for (const Flop& flop : flops) { dbMaster* master = insts_[flop.idx]->getMaster(); single_bit_height_ = std::min(single_bit_height_, master->getHeight() / multiplier_); single_bit_width_ = std::min(single_bit_width_, master->getWidth() / multiplier_); - const float leakage = getLeakage(insts_[flop.idx]->getMaster()); - single_bit_power_ = std::min(single_bit_power_, leakage); + auto [it, inserted] = energy_cache.try_emplace(master, 0.0f); + if (inserted) { + it->second = getInternalEnergy(insts_[flop.idx]); + } + const float leakage = getLeakage(master); + const float total_power = leakage + it->second * activity; + // Select the single-bit cell with lowest total estimated power as + // the baseline. Both leakage and internal energy must come from the + // same cell to avoid an artificially low baseline. + if (total_power < single_bit_power_) { + single_bit_power_ = total_power; + single_bit_master_ = master; + } } } @@ -2202,6 +2311,18 @@ void MBFF::SetRatios(const Mask& array_mask) norm_power_.clear(); norm_power_.push_back(1.00); + const float activity = clockActivity(); + + debugPrint(log_, + GPL, + "mbff", + 1, + "mask: {} sb_cell: {} sb_power: {} clock_period: {}", + array_mask.to_string(), + single_bit_master_ ? 
single_bit_master_->getName() : "none", + single_bit_power_, + clock_period_); + for (int i = 1; i < num_sizes_; i++) { norm_area_.push_back(std::numeric_limits::max()); norm_power_.push_back(std::numeric_limits::max()); @@ -2210,8 +2331,24 @@ void MBFF::SetRatios(const Mask& array_mask) norm_area_[i] = (tray_area_[array_mask][i] / (single_bit_height_ * single_bit_width_)) / slot_cnt; - norm_power_[i] - = (tray_power_[array_mask][i] / slot_cnt) / single_bit_power_; + if (single_bit_power_ > 0) { + const float tray_total + = tray_power_[array_mask][i] + + tray_internal_energy_[array_mask][i] * activity; + norm_power_[i] = (tray_total / slot_cnt) / single_bit_power_; + debugPrint(log_, + GPL, + "mbff", + 1, + " {}-bit {}: tray_leakage: {} tray_internal_energy: {} " + "tray_total: {} norm_power: {}", + slot_cnt, + best_master_[array_mask][i]->getName(), + tray_power_[array_mask][i], + tray_internal_energy_[array_mask][i], + tray_total, + norm_power_[i]); + } } } } @@ -2285,8 +2422,8 @@ void MBFF::Run(const int mx_sz, const float alpha, const float beta) for (int i = 0; i < num_chunks; i++) { dbInst* ff_inst = insts_[FFs[i].back().idx]; const Mask array_mask = GetArrayMask(ff_inst, false); - // do we even have trays to cluster these flops? - if (best_master_[array_mask].empty()) { + // do we even have tray candidates to cluster these flops? 
+ if (!tray_candidates_.contains(array_mask)) { tot_ilp += (alpha * FFs[i].size()); tray_sizes_used_[1] += FFs[i].size(); log_->info(GPL, @@ -2297,6 +2434,8 @@ void MBFF::Run(const int mx_sz, const float alpha, const float beta) continue; } any_found = true; + clock_period_ = getClockPeriod(ff_inst); + SelectBestTrays(array_mask, clockActivity()); SetVars(FFs[i]); SetRatios(array_mask); tot_ilp += RunClustering(FFs[i], mx_sz, alpha, beta, array_mask); @@ -2383,105 +2522,138 @@ void MBFF::ReadLibs() const int idx = GetBitIdx(num_slots); const Mask array_mask = GetArrayMask(tmp_tray, true); - if (best_master_[array_mask].empty()) { - best_master_[array_mask].resize(num_sizes_, nullptr); - tray_area_[array_mask].resize(num_sizes_, - std::numeric_limits::max()); - tray_power_[array_mask].resize(num_sizes_, - std::numeric_limits::max()); - tray_width_[array_mask].resize(num_sizes_); - pin_mappings_[array_mask].resize(num_sizes_); - - slot_to_tray_x_[array_mask].resize(num_sizes_); - slot_to_tray_y_[array_mask].resize(num_sizes_); + if (tray_candidates_[array_mask].empty()) { + tray_candidates_[array_mask].resize(num_sizes_); } const float cur_area = (master->getHeight() / multiplier_) * (master->getWidth() / multiplier_); const float cur_leakage = getLeakage(tmp_tray->getMaster()); + const float cur_internal_energy = getInternalEnergy(tmp_tray); debugPrint(log_, GPL, "mbff", 1, - "Found tray {} mask: {} area: {} leakage power: {}", + "Found tray {} mask: {} area: {} leakage: {} " + "internal_energy: {}", master->getName(), array_mask.to_string(), cur_area, - cur_leakage); - - if (std::tie(tray_power_[array_mask][idx], tray_area_[array_mask][idx]) - > std::tie(cur_leakage, cur_area)) { - tray_area_[array_mask][idx] = cur_area; - tray_power_[array_mask][idx] = cur_leakage; - best_master_[array_mask][idx] = master; - pin_mappings_[array_mask][idx] = GetPinMapping(tmp_tray); - tray_width_[array_mask][idx] = master->getWidth() / multiplier_; - - // save slot info - 
tmp_tray->setLocation(0, 0); - tmp_tray->setPlacementStatus(odb::dbPlacementStatus::PLACED); - - slot_to_tray_x_[array_mask][idx].clear(); - slot_to_tray_y_[array_mask][idx].clear(); - - std::vector d; - std::vector q; - std::vector qn; - - for (const auto& p : pin_mappings_[array_mask][idx]) { - dbITerm* d_pin = tmp_tray->findITerm(p.first->name().c_str()); - dbITerm* q_pin - = (p.second.q ? tmp_tray->findITerm(p.second.q->name().c_str()) - : nullptr); - dbITerm* qn_pin - = (p.second.qn ? tmp_tray->findITerm(p.second.qn->name().c_str()) - : nullptr); - - d.push_back(Point{ - d_pin->getBBox().xCenter() / multiplier_, - d_pin->getBBox().yCenter() / multiplier_, + cur_leakage, + cur_internal_energy); + + // Collect slot geometry from the temporary instance. + tmp_tray->setLocation(0, 0); + tmp_tray->setPlacementStatus(odb::dbPlacementStatus::PLACED); + + DataToOutputsMap pin_mapping = GetPinMapping(tmp_tray); + + std::vector d; + std::vector q; + std::vector qn; + + for (const auto& p : pin_mapping) { + dbITerm* d_pin = tmp_tray->findITerm(p.first->name().c_str()); + dbITerm* q_pin + = (p.second.q ? tmp_tray->findITerm(p.second.q->name().c_str()) + : nullptr); + dbITerm* qn_pin + = (p.second.qn ? tmp_tray->findITerm(p.second.qn->name().c_str()) + : nullptr); + + d.push_back(Point{ + d_pin->getBBox().xCenter() / multiplier_, + d_pin->getBBox().yCenter() / multiplier_, + }); + + if (q_pin) { + q.push_back(Point{ + q_pin->getBBox().xCenter() / multiplier_, + q_pin->getBBox().yCenter() / multiplier_, }); + } - if (q_pin) { - q.push_back(Point{ - q_pin->getBBox().xCenter() / multiplier_, - q_pin->getBBox().yCenter() / multiplier_, - }); - } - - if (qn_pin) { - qn.push_back(Point{ - qn_pin->getBBox().xCenter() / multiplier_, - qn_pin->getBBox().yCenter() / multiplier_, - }); - } + if (qn_pin) { + qn.push_back(Point{ + qn_pin->getBBox().xCenter() / multiplier_, + qn_pin->getBBox().yCenter() / multiplier_, + }); } + } - // slots w.r.t. 
bottom-left corner - for (int i = 0; i < num_slots; i++) { - if (!q.empty() && !qn.empty()) { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, std::max(q[i].x, qn[i].x)) - + std::min(d[i].x, std::min(q[i].x, qn[i].x))) - / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, std::max(q[i].y, qn[i].y)) - + std::min(d[i].y, std::min(q[i].y, qn[i].y))) - / 2.0); - } else if (!q.empty()) { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, q[i].x) + std::min(d[i].x, q[i].x)) / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, q[i].y) + std::min(d[i].y, q[i].y)) / 2.0); - } else { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, qn[i].x) + std::min(d[i].x, qn[i].x)) / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, qn[i].y) + std::min(d[i].y, qn[i].y)) / 2.0); - } + std::vector slot_x; + std::vector slot_y; + for (int i = 0; i < num_slots; i++) { + if (!q.empty() && !qn.empty()) { + slot_x.push_back((std::max(d[i].x, std::max(q[i].x, qn[i].x)) + + std::min(d[i].x, std::min(q[i].x, qn[i].x))) + / 2.0); + slot_y.push_back((std::max(d[i].y, std::max(q[i].y, qn[i].y)) + + std::min(d[i].y, std::min(q[i].y, qn[i].y))) + / 2.0); + } else if (!q.empty()) { + slot_x.push_back((d[i].x + q[i].x) / 2.0); + slot_y.push_back((d[i].y + q[i].y) / 2.0); + } else { + slot_x.push_back((d[i].x + qn[i].x) / 2.0); + slot_y.push_back((d[i].y + qn[i].y) / 2.0); } } + + tray_candidates_[array_mask][idx].push_back( + TrayCandidate{master, + cur_area, + cur_leakage, + cur_internal_energy, + master->getWidth() / multiplier_, + std::move(pin_mapping), + std::move(slot_x), + std::move(slot_y)}); + } + } +} + +void MBFF::SelectBestTrays(const Mask& mask, const float activity) +{ + auto it = tray_candidates_.find(mask); + if (it == tray_candidates_.end()) { + return; + } + const auto& candidates_per_size = it->second; + + best_master_[mask].assign(num_sizes_, nullptr); + 
tray_area_[mask].assign(num_sizes_, std::numeric_limits::max()); + tray_power_[mask].assign(num_sizes_, std::numeric_limits::max()); + tray_internal_energy_[mask].assign(num_sizes_, 0.0); + tray_width_[mask].assign(num_sizes_, 0.0f); + pin_mappings_[mask].assign(num_sizes_, DataToOutputsMap{}); + slot_to_tray_x_[mask].assign(num_sizes_, {}); + slot_to_tray_y_[mask].assign(num_sizes_, {}); + + for (int idx = 0; idx < num_sizes_; idx++) { + const TrayCandidate* best = nullptr; + float best_total_power = std::numeric_limits::max(); + float best_area = std::numeric_limits::max(); + for (const TrayCandidate& cand : candidates_per_size[idx]) { + const float cur_total_power + = cand.leakage + cand.internal_energy * activity; + if (std::tie(best_total_power, best_area) + > std::tie(cur_total_power, cand.area)) { + best = &cand; + best_total_power = cur_total_power; + best_area = cand.area; + } + } + if (best) { + best_master_[mask][idx] = best->master; + tray_area_[mask][idx] = best->area; + tray_power_[mask][idx] = best->leakage; + tray_internal_energy_[mask][idx] = best->internal_energy; + tray_width_[mask][idx] = best->width; + pin_mappings_[mask][idx] = best->pin_mapping; + slot_to_tray_x_[mask][idx] = best->slot_x; + slot_to_tray_y_[mask][idx] = best->slot_y; } } } @@ -2597,6 +2769,8 @@ MBFF::MBFF(odb::dbDatabase* db, single_bit_height_(0.0), single_bit_width_(0.0), single_bit_power_(0.0), + clock_period_(0.0), + single_bit_master_(nullptr), test_idx_(-1) { graphics_->setDebugOn(debug_graphics); diff --git a/src/gpl/src/mbff.h b/src/gpl/src/mbff.h index 3279094881c..13e76eed369 100644 --- a/src/gpl/src/mbff.h +++ b/src/gpl/src/mbff.h @@ -94,6 +94,18 @@ class MBFF = std::map; DataToOutputsMap GetPinMapping(odb::dbInst* tray); + struct TrayCandidate + { + odb::dbMaster* master; + float area; + float leakage; + float internal_energy; + float width; + DataToOutputsMap pin_mapping; + std::vector slot_x; + std::vector slot_y; + }; + // MBFF functions const sta::LibertyCell* 
getLibertyCell(const sta::Cell* cell); float GetDist(const Point& a, const Point& b); @@ -225,12 +237,16 @@ class MBFF void ReadFFs(); void ReadPaths(); void ReadLibs(); + void SelectBestTrays(const Mask& mask, float activity); void SetTrayNames(); void displayFlopClusters(const char* stage, std::vector>& clusters); float getLeakage(odb::dbMaster* master); + float getInternalEnergy(odb::dbInst* inst); + float clockActivity() const; + float getClockPeriod(odb::dbInst* ff_inst); // OpenROAD vars odb::dbDatabase* db_; @@ -254,6 +270,8 @@ class MBFF float single_bit_height_; float single_bit_width_; float single_bit_power_; + float clock_period_; + odb::dbMaster* single_bit_master_; // launch-capture FF-pair vars std::map name_to_idx_; @@ -270,9 +288,11 @@ class MBFF ArrayMaskVector pin_mappings_; ArrayMaskVector tray_area_; ArrayMaskVector tray_power_; + ArrayMaskVector tray_internal_energy_; ArrayMaskVector tray_width_; ArrayMaskVector> slot_to_tray_x_; ArrayMaskVector> slot_to_tray_y_; + ArrayMaskVector> tray_candidates_; std::vector norm_area_; std::vector norm_power_; std::vector unused_; diff --git a/src/gpl/test/mbff_orig_name.ok b/src/gpl/test/mbff_orig_name.ok index 6e25ca9640e..40f414e2c39 100644 --- a/src/gpl/test/mbff_orig_name.ok +++ b/src/gpl/test/mbff_orig_name.ok @@ -10,14 +10,14 @@ [INFO ODB-0131] Created 4 components and 20 component-terminals. [INFO ODB-0133] Created 9 nets and 12 connections. 
Alpha = 40.0, Beta = 1.0, #paths = 0, max size = -1 -Total ILP Cost: 97.228 +Total ILP Cost: 112.643 Total Timing Critical Path Displacement: 0.0 -Average slot-to-flop displacement: 0.865 -Final Objective Value: 97.228 +Average slot-to-flop displacement: 1.730 +Final Objective Value: 112.643 Sizes used - 2-bit: 2 -Startpoint: d1 (input port clocked by clk) -Endpoint: _tray_size2_7 (rising edge-triggered flip-flop clocked by clk) + 4-bit: 1 +Startpoint: d3 (input port clocked by clk) +Endpoint: _tray_size4_7 (rising edge-triggered flip-flop clocked by clk) Path Group: clk Path Type: max @@ -26,14 +26,14 @@ Path Type: max 0.00 0.00 clock clk (rise edge) 0.00 0.00 clock network delay (ideal) 0.00 0.00 ^ input external delay - 0.00 0.00 ^ d1 (in) - 0.00 0.00 ^ _tray_size2_7/D1 (DFFHQNV2Xx1_ASAP7_75t_L) ff1/D + 0.00 0.00 ^ d3 (in) + 0.00 0.00 ^ _tray_size4_7/D1 (DFFHQNV4Xx1_ASAP7_75t_L) ff3/D 0.00 data arrival time 1000.00 1000.00 clock clk (rise edge) 0.00 1000.00 clock network delay (ideal) 0.00 1000.00 clock reconvergence pessimism - 1000.00 ^ _tray_size2_7/CLK (DFFHQNV2Xx1_ASAP7_75t_L) + 1000.00 ^ _tray_size4_7/CLK (DFFHQNV4Xx1_ASAP7_75t_L) -22.99 977.01 library setup time 977.01 data required time --------------------------------------------------------------------------------------------- diff --git a/src/gpl/test/mbff_orig_name.tcl b/src/gpl/test/mbff_orig_name.tcl index 6b3957b1645..69d7969da32 100644 --- a/src/gpl/test/mbff_orig_name.tcl +++ b/src/gpl/test/mbff_orig_name.tcl @@ -26,4 +26,4 @@ cluster_flops -tray_weight 40.0 \ # Report timing to verify original FF names appear in the path report. # After clustering the tray pin descriptions should show in the Orig Name column. -report_checks -path_delay max -fields {orig_name} -through [get_pins _tray_size2_7/D1] +report_checks -path_delay max -fields {orig_name} -through [get_pins _tray_size4_7/D1]