diff --git a/src/gpl/src/mbff.cpp b/src/gpl/src/mbff.cpp index dd3c15bc0fc..6d11a1ec033 100644 --- a/src/gpl/src/mbff.cpp +++ b/src/gpl/src/mbff.cpp @@ -2178,20 +2178,129 @@ float MBFF::getLeakage(odb::dbMaster* master) return cell_leakage; } +float MBFF::getInternalEnergy(odb::dbInst* inst) +{ + odb::dbMaster* master = inst->getMaster(); + sta::Cell* cell = network_->dbToSta(master); + sta::LibertyCell* lib_cell = network_->libertyCell(cell); + sta::LibertyCell* corner_cell + = lib_cell->sceneCell(corner_, sta::MinMax::max()); + if (!corner_cell) { + return 0.0; + } + + // Sum average internal energy across all pins (CK, D, Q, SE, SI, ...). + // For each pin, when conditions partition the input states; we average + // across all groups (uniform duty assumption). This captures the full + // cell energy profile -- MBFF cells share SE/SI/CK structures across + // bits, giving substantial savings that clock-pin-only analysis misses. + float total_energy = 0.0; + for (odb::dbITerm* iterm : inst->getITerms()) { + if (IsSupplyPin(iterm)) { + continue; + } + const sta::Pin* pin = network_->dbToSta(iterm); + const sta::LibertyPort* port = network_->libertyPort(pin); + if (!port) { + continue; + } + const sta::LibertyPort* scene_port + = port->scenePort(corner_, sta::MinMax::max()); + if (!scene_port) { + continue; + } + float port_energy_sum = 0.0; + int group_count = 0; + for (const sta::InternalPower* pwr : + corner_cell->internalPowers(scene_port)) { + float energy = 0.0; + int rf_count = 0; + for (const sta::RiseFall* rf : sta::RiseFall::range()) { + const sta::InternalPowerModel& model = pwr->model(rf); + const sta::TableModel* tbl = model.model(); + if (!tbl) { + continue; + } + float v1 = 0, v2 = 0; + if (tbl->axis1()) { + v1 = (tbl->axis1()->min() + tbl->axis1()->max()) / 2.0f; + } + if (tbl->axis2()) { + v2 = (tbl->axis2()->min() + tbl->axis2()->max()) / 2.0f; + } + energy += tbl->findValue(v1, v2, 0.0f); + rf_count++; + } + if (rf_count > 0) { + port_energy_sum += energy / rf_count; + group_count++; + } + } + if (group_count > 0) { + const float pin_energy = port_energy_sum / group_count; + total_energy += pin_energy; + debugPrint(log_, + GPL, + "mbff", + 2, + " pin {} groups={} energy={}", + port->name(), + group_count, + pin_energy); + } + } + return total_energy; +} + +float MBFF::clockActivity() const +{ + return (clock_period_ > 0) ? (2.0 / clock_period_) : 0.0; +} + +float MBFF::getClockPeriod(odb::dbInst* ff_inst) +{ + float period = 0.0; + for (odb::dbITerm* iterm : ff_inst->getITerms()) { + if (IsClockPin(iterm)) { + const sta::Pin* sta_pin = network_->dbToSta(iterm); + for (const sta::Clock* clk : sta_->clocks(sta_pin, corner_->mode())) { + if (period == 0.0 || clk->period() < period) { + period = clk->period(); + } + } + break; + } + } + return period; +} + void MBFF::SetVars(const std::vector& flops) { // get min height and width single_bit_height_ = std::numeric_limits::max(); single_bit_width_ = std::numeric_limits::max(); single_bit_power_ = std::numeric_limits::max(); + const float activity = clockActivity(); + std::map energy_cache; for (const Flop& flop : flops) { dbMaster* master = insts_[flop.idx]->getMaster(); single_bit_height_ = std::min(single_bit_height_, master->getHeight() / multiplier_); single_bit_width_ = std::min(single_bit_width_, master->getWidth() / multiplier_); - const float leakage = getLeakage(insts_[flop.idx]->getMaster()); - single_bit_power_ = std::min(single_bit_power_, leakage); + auto [it, inserted] = energy_cache.try_emplace(master, 0.0f); + if (inserted) { + it->second = getInternalEnergy(insts_[flop.idx]); + } + const float leakage = getLeakage(master); + const float total_power = leakage + it->second * activity; + // Select the single-bit cell with lowest total estimated power as + // the baseline. Both leakage and internal energy must come from the + // same cell to avoid an artificially low baseline. + if (total_power < single_bit_power_) { + single_bit_power_ = total_power; + single_bit_master_ = master; + } } } @@ -2202,6 +2311,18 @@ void MBFF::SetRatios(const Mask& array_mask) norm_power_.clear(); norm_power_.push_back(1.00); + const float activity = clockActivity(); + + debugPrint(log_, + GPL, + "mbff", + 1, + "mask: {} sb_cell: {} sb_power: {} clock_period: {}", + array_mask.to_string(), + single_bit_master_ ? single_bit_master_->getName() : "none", + single_bit_power_, + clock_period_); + for (int i = 1; i < num_sizes_; i++) { norm_area_.push_back(std::numeric_limits::max()); norm_power_.push_back(std::numeric_limits::max()); @@ -2210,8 +2331,24 @@ void MBFF::SetRatios(const Mask& array_mask) norm_area_[i] = (tray_area_[array_mask][i] / (single_bit_height_ * single_bit_width_)) / slot_cnt; - norm_power_[i] - = (tray_power_[array_mask][i] / slot_cnt) / single_bit_power_; + if (single_bit_power_ > 0) { + const float tray_total + = tray_power_[array_mask][i] + + tray_internal_energy_[array_mask][i] * activity; + norm_power_[i] = (tray_total / slot_cnt) / single_bit_power_; + debugPrint(log_, + GPL, + "mbff", + 1, + " {}-bit {}: tray_leakage: {} tray_internal_energy: {} " + "tray_total: {} norm_power: {}", + slot_cnt, + best_master_[array_mask][i]->getName(), + tray_power_[array_mask][i], + tray_internal_energy_[array_mask][i], + tray_total, + norm_power_[i]); + } } } } @@ -2285,8 +2422,8 @@ void MBFF::Run(const int mx_sz, const float alpha, const float beta) for (int i = 0; i < num_chunks; i++) { dbInst* ff_inst = insts_[FFs[i].back().idx]; const Mask array_mask = GetArrayMask(ff_inst, false); - // do we even have trays to cluster these flops? - if (best_master_[array_mask].empty()) { + // do we even have tray candidates to cluster these flops? + if (!tray_candidates_.contains(array_mask)) { tot_ilp += (alpha * FFs[i].size()); tray_sizes_used_[1] += FFs[i].size(); log_->info(GPL, @@ -2297,6 +2434,8 @@ void MBFF::Run(const int mx_sz, const float alpha, const float beta) continue; } any_found = true; + clock_period_ = getClockPeriod(ff_inst); + SelectBestTrays(array_mask, clockActivity()); SetVars(FFs[i]); SetRatios(array_mask); tot_ilp += RunClustering(FFs[i], mx_sz, alpha, beta, array_mask); @@ -2383,105 +2522,138 @@ void MBFF::ReadLibs() const int idx = GetBitIdx(num_slots); const Mask array_mask = GetArrayMask(tmp_tray, true); - if (best_master_[array_mask].empty()) { - best_master_[array_mask].resize(num_sizes_, nullptr); - tray_area_[array_mask].resize(num_sizes_, - std::numeric_limits::max()); - tray_power_[array_mask].resize(num_sizes_, - std::numeric_limits::max()); - tray_width_[array_mask].resize(num_sizes_); - pin_mappings_[array_mask].resize(num_sizes_); - - slot_to_tray_x_[array_mask].resize(num_sizes_); - slot_to_tray_y_[array_mask].resize(num_sizes_); + if (tray_candidates_[array_mask].empty()) { + tray_candidates_[array_mask].resize(num_sizes_); } const float cur_area = (master->getHeight() / multiplier_) * (master->getWidth() / multiplier_); const float cur_leakage = getLeakage(tmp_tray->getMaster()); + const float cur_internal_energy = getInternalEnergy(tmp_tray); debugPrint(log_, GPL, "mbff", 1, - "Found tray {} mask: {} area: {} leakage power: {}", + "Found tray {} mask: {} area: {} leakage: {} " + "internal_energy: {}", master->getName(), array_mask.to_string(), cur_area, - cur_leakage); - - if (std::tie(tray_power_[array_mask][idx], tray_area_[array_mask][idx]) - > std::tie(cur_leakage, cur_area)) { - tray_area_[array_mask][idx] = cur_area; - tray_power_[array_mask][idx] = cur_leakage; - best_master_[array_mask][idx] = master; - pin_mappings_[array_mask][idx] = GetPinMapping(tmp_tray); - tray_width_[array_mask][idx] = master->getWidth() / multiplier_; - - // save slot info - tmp_tray->setLocation(0, 0); - tmp_tray->setPlacementStatus(odb::dbPlacementStatus::PLACED); - - slot_to_tray_x_[array_mask][idx].clear(); - slot_to_tray_y_[array_mask][idx].clear(); - - std::vector d; - std::vector q; - std::vector qn; - - for (const auto& p : pin_mappings_[array_mask][idx]) { - dbITerm* d_pin = tmp_tray->findITerm(p.first->name().c_str()); - dbITerm* q_pin - = (p.second.q ? tmp_tray->findITerm(p.second.q->name().c_str()) - : nullptr); - dbITerm* qn_pin - = (p.second.qn ? tmp_tray->findITerm(p.second.qn->name().c_str()) - : nullptr); - - d.push_back(Point{ - d_pin->getBBox().xCenter() / multiplier_, - d_pin->getBBox().yCenter() / multiplier_, + cur_leakage, + cur_internal_energy); + + // Collect slot geometry from the temporary instance. + tmp_tray->setLocation(0, 0); + tmp_tray->setPlacementStatus(odb::dbPlacementStatus::PLACED); + + DataToOutputsMap pin_mapping = GetPinMapping(tmp_tray); + + std::vector d; + std::vector q; + std::vector qn; + + for (const auto& p : pin_mapping) { + dbITerm* d_pin = tmp_tray->findITerm(p.first->name().c_str()); + dbITerm* q_pin + = (p.second.q ? tmp_tray->findITerm(p.second.q->name().c_str()) + : nullptr); + dbITerm* qn_pin + = (p.second.qn ? tmp_tray->findITerm(p.second.qn->name().c_str()) + : nullptr); + + d.push_back(Point{ + d_pin->getBBox().xCenter() / multiplier_, + d_pin->getBBox().yCenter() / multiplier_, + }); + + if (q_pin) { + q.push_back(Point{ + q_pin->getBBox().xCenter() / multiplier_, + q_pin->getBBox().yCenter() / multiplier_, }); + } - if (q_pin) { - q.push_back(Point{ - q_pin->getBBox().xCenter() / multiplier_, - q_pin->getBBox().yCenter() / multiplier_, - }); - } - - if (qn_pin) { - qn.push_back(Point{ - qn_pin->getBBox().xCenter() / multiplier_, - qn_pin->getBBox().yCenter() / multiplier_, - }); - } + if (qn_pin) { + qn.push_back(Point{ + qn_pin->getBBox().xCenter() / multiplier_, + qn_pin->getBBox().yCenter() / multiplier_, + }); } + } - // slots w.r.t. bottom-left corner - for (int i = 0; i < num_slots; i++) { - if (!q.empty() && !qn.empty()) { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, std::max(q[i].x, qn[i].x)) - + std::min(d[i].x, std::min(q[i].x, qn[i].x))) - / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, std::max(q[i].y, qn[i].y)) - + std::min(d[i].y, std::min(q[i].y, qn[i].y))) - / 2.0); - } else if (!q.empty()) { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, q[i].x) + std::min(d[i].x, q[i].x)) / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, q[i].y) + std::min(d[i].y, q[i].y)) / 2.0); - } else { - slot_to_tray_x_[array_mask][idx].push_back( - (std::max(d[i].x, qn[i].x) + std::min(d[i].x, qn[i].x)) / 2.0); - slot_to_tray_y_[array_mask][idx].push_back( - (std::max(d[i].y, qn[i].y) + std::min(d[i].y, qn[i].y)) / 2.0); - } + std::vector slot_x; + std::vector slot_y; + for (int i = 0; i < num_slots; i++) { + if (!q.empty() && !qn.empty()) { + slot_x.push_back((std::max(d[i].x, std::max(q[i].x, qn[i].x)) + + std::min(d[i].x, std::min(q[i].x, qn[i].x))) + / 2.0); + slot_y.push_back((std::max(d[i].y, std::max(q[i].y, qn[i].y)) + + std::min(d[i].y, std::min(q[i].y, qn[i].y))) + / 2.0); + } else if (!q.empty()) { + slot_x.push_back((d[i].x + q[i].x) / 2.0); + slot_y.push_back((d[i].y + q[i].y) / 2.0); + } else { + slot_x.push_back((d[i].x + qn[i].x) / 2.0); + slot_y.push_back((d[i].y + qn[i].y) / 2.0); } } + + tray_candidates_[array_mask][idx].push_back( + TrayCandidate{master, + cur_area, + cur_leakage, + cur_internal_energy, + master->getWidth() / multiplier_, + std::move(pin_mapping), + std::move(slot_x), + std::move(slot_y)}); + } + } +} + +void MBFF::SelectBestTrays(const Mask& mask, const float activity) +{ + auto it = tray_candidates_.find(mask); + if (it == tray_candidates_.end()) { + return; + } + const auto& candidates_per_size = it->second; + + best_master_[mask].assign(num_sizes_, nullptr); + tray_area_[mask].assign(num_sizes_, std::numeric_limits::max()); + tray_power_[mask].assign(num_sizes_, std::numeric_limits::max()); + tray_internal_energy_[mask].assign(num_sizes_, 0.0); + tray_width_[mask].assign(num_sizes_, 0.0f); + pin_mappings_[mask].assign(num_sizes_, DataToOutputsMap{}); + slot_to_tray_x_[mask].assign(num_sizes_, {}); + slot_to_tray_y_[mask].assign(num_sizes_, {}); + + for (int idx = 0; idx < num_sizes_; idx++) { + const TrayCandidate* best = nullptr; + float best_total_power = std::numeric_limits::max(); + float best_area = std::numeric_limits::max(); + for (const TrayCandidate& cand : candidates_per_size[idx]) { + const float cur_total_power + = cand.leakage + cand.internal_energy * activity; + if (std::tie(best_total_power, best_area) + > std::tie(cur_total_power, cand.area)) { + best = &cand; + best_total_power = cur_total_power; + best_area = cand.area; + } + } + if (best) { + best_master_[mask][idx] = best->master; + tray_area_[mask][idx] = best->area; + tray_power_[mask][idx] = best->leakage; + tray_internal_energy_[mask][idx] = best->internal_energy; + tray_width_[mask][idx] = best->width; + pin_mappings_[mask][idx] = best->pin_mapping; + slot_to_tray_x_[mask][idx] = best->slot_x; + slot_to_tray_y_[mask][idx] = best->slot_y; } } } @@ -2597,6 +2769,8 @@ MBFF::MBFF(odb::dbDatabase* db, single_bit_height_(0.0), single_bit_width_(0.0), single_bit_power_(0.0), + clock_period_(0.0), + single_bit_master_(nullptr), test_idx_(-1) { graphics_->setDebugOn(debug_graphics); diff --git a/src/gpl/src/mbff.h b/src/gpl/src/mbff.h index 3279094881c..13e76eed369 100644 --- a/src/gpl/src/mbff.h +++ b/src/gpl/src/mbff.h @@ -94,6 +94,18 @@ class MBFF = std::map; DataToOutputsMap GetPinMapping(odb::dbInst* tray); + struct TrayCandidate + { + odb::dbMaster* master; + float area; + float leakage; + float internal_energy; + float width; + DataToOutputsMap pin_mapping; + std::vector slot_x; + std::vector slot_y; + }; + // MBFF functions const sta::LibertyCell* getLibertyCell(const sta::Cell* cell); float GetDist(const Point& a, const Point& b); @@ -225,12 +237,16 @@ class MBFF void ReadFFs(); void ReadPaths(); void ReadLibs(); + void SelectBestTrays(const Mask& mask, float activity); void SetTrayNames(); void displayFlopClusters(const char* stage, std::vector>& clusters); float getLeakage(odb::dbMaster* master); + float getInternalEnergy(odb::dbInst* inst); + float clockActivity() const; + float getClockPeriod(odb::dbInst* ff_inst); // OpenROAD vars odb::dbDatabase* db_; @@ -254,6 +270,8 @@ class MBFF float single_bit_height_; float single_bit_width_; float single_bit_power_; + float clock_period_; + odb::dbMaster* single_bit_master_; // launch-capture FF-pair vars std::map name_to_idx_; @@ -270,9 +288,11 @@ class MBFF ArrayMaskVector pin_mappings_; ArrayMaskVector tray_area_; ArrayMaskVector tray_power_; + ArrayMaskVector tray_internal_energy_; ArrayMaskVector tray_width_; ArrayMaskVector> slot_to_tray_x_; ArrayMaskVector> slot_to_tray_y_; + ArrayMaskVector> tray_candidates_; std::vector norm_area_; std::vector norm_power_; std::vector unused_; diff --git a/src/gpl/test/mbff_orig_name.ok b/src/gpl/test/mbff_orig_name.ok index 6e25ca9640e..40f414e2c39 100644 --- a/src/gpl/test/mbff_orig_name.ok +++ b/src/gpl/test/mbff_orig_name.ok @@ -10,14 +10,14 @@ [INFO ODB-0131] Created 4 components and 20 component-terminals. [INFO ODB-0133] Created 9 nets and 12 connections. Alpha = 40.0, Beta = 1.0, #paths = 0, max size = -1 -Total ILP Cost: 97.228 +Total ILP Cost: 112.643 Total Timing Critical Path Displacement: 0.0 -Average slot-to-flop displacement: 0.865 -Final Objective Value: 97.228 +Average slot-to-flop displacement: 1.730 +Final Objective Value: 112.643 Sizes used - 2-bit: 2 -Startpoint: d1 (input port clocked by clk) -Endpoint: _tray_size2_7 (rising edge-triggered flip-flop clocked by clk) + 4-bit: 1 +Startpoint: d3 (input port clocked by clk) +Endpoint: _tray_size4_7 (rising edge-triggered flip-flop clocked by clk) Path Group: clk Path Type: max @@ -26,14 +26,14 @@ Path Type: max 0.00 0.00 clock clk (rise edge) 0.00 0.00 clock network delay (ideal) 0.00 0.00 ^ input external delay - 0.00 0.00 ^ d1 (in) - 0.00 0.00 ^ _tray_size2_7/D1 (DFFHQNV2Xx1_ASAP7_75t_L) ff1/D + 0.00 0.00 ^ d3 (in) + 0.00 0.00 ^ _tray_size4_7/D1 (DFFHQNV4Xx1_ASAP7_75t_L) ff3/D 0.00 data arrival time 1000.00 1000.00 clock clk (rise edge) 0.00 1000.00 clock network delay (ideal) 0.00 1000.00 clock reconvergence pessimism - 1000.00 ^ _tray_size2_7/CLK (DFFHQNV2Xx1_ASAP7_75t_L) + 1000.00 ^ _tray_size4_7/CLK (DFFHQNV4Xx1_ASAP7_75t_L) -22.99 977.01 library setup time 977.01 data required time --------------------------------------------------------------------------------------------- diff --git a/src/gpl/test/mbff_orig_name.tcl b/src/gpl/test/mbff_orig_name.tcl index 6b3957b1645..69d7969da32 100644 --- a/src/gpl/test/mbff_orig_name.tcl +++ b/src/gpl/test/mbff_orig_name.tcl @@ -26,4 +26,4 @@ cluster_flops -tray_weight 40.0 \ # Report timing to verify original FF names appear in the path report. # After clustering the tray pin descriptions should show in the Orig Name column. -report_checks -path_delay max -fields {orig_name} -through [get_pins _tray_size2_7/D1] +report_checks -path_delay max -fields {orig_name} -through [get_pins _tray_size4_7/D1]