From e282e05173e9bf5dce91973dbb52080bdf232a65 Mon Sep 17 00:00:00 2001 From: jaspals Date: Mon, 6 Apr 2026 21:51:58 +0000 Subject: [PATCH 1/5] amdsmi update --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 5 +- .../plugins/inband/amdsmi/amdsmi_collector.py | 5 + .../plugins/inband/amdsmi/amdsmidata.py | 125 ++++++++++++++++++ .../plugins/inband/amdsmi/analyzer_args.py | 29 ++++ 4 files changed, 162 insertions(+), 2 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 815affdc..a8ba797e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -661,8 +661,9 @@ def check_expected_xgmi_link_speed( if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0: self._log_event( category=EventCategory.IO, - description="Expected XGMI speed not configured, skipping XGMI link speed check", - priority=EventPriority.WARNING, + description=("Expected XGMI link speed not set; skipping XGMI link speed analysis"), + priority=EventPriority.INFO, + console_log=True, ) return diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 860c0e0f..9c988748 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -71,6 +71,7 @@ ValueUnit, XgmiLinks, XgmiMetrics, + build_amd_smi_analysis_ref, ) from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs from nodescraper.utils import get_exception_traceback @@ -475,6 +476,9 @@ def _get_amdsmi_data( return None try: + analysis_ref = build_amd_smi_analysis_ref( + statics, processes, partition, firmware, xgmi_metric + ) return AmdSmiDataModel( version=version, gpu_list=gpu_list, @@ -489,6 +493,7 @@ def _get_amdsmi_data( xgmi_link=xgmi_link or [], cper_data=cper_data, cper_afids=cper_afids, + analysis_ref=analysis_ref, ) except ValidationError as err: self.logger.warning("Validation err: %s", err) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 04ff545f..4f83eeb1 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -927,6 +927,25 @@ class Topo(BaseModel): links: list[TopoLink] +class AmdSmiAnalysisRef(BaseModel): + """Collector-filled summary for reference config""" + + model_config = ConfigDict(extra="forbid") + + gpu_processes_max: Optional[int] = None + max_power_w: Optional[int] = None + amdgpu_drv_version: Optional[str] = None + mem_part_mode: Optional[str] = None + compute_part_mode: Optional[str] = None + pldm_version: Optional[str] = None + ep_vendor_id: Optional[str] = None + ep_subvendor_id: Optional[str] = None + ep_device_id: Optional[str] = None + ep_subsystem_id: Optional[str] = None + ep_market_name: Optional[str] = None + xgmi_rates: Optional[list[float]] = None + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. @@ -957,6 +976,8 @@ class AmdSmiDataModel(DataModel): cper_data: Optional[list[FileModel]] = Field(default_factory=list) cper_afids: dict[str, int] = Field(default_factory=dict) + analysis_ref: Optional[AmdSmiAnalysisRef] = None + def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: """Get the gpu list item for the given gpu id.""" if self.gpu_list is None: @@ -1001,3 +1022,107 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]: if item.gpu == gpu: return item return None + + +_PLDM_FW_ID = "PLDM_BUNDLE" + + +def build_amd_smi_analysis_ref( + static: Optional[list[AmdSmiStatic]], + process: Optional[list[Processes]], + partition: Optional[Partition], + firmware: Optional[list[Fw]], + xgmi_metric: Optional[list[XgmiMetrics]], +) -> AmdSmiAnalysisRef: + """Build analysis summary from collected structures (called by AmdSmiCollector).""" + static = static or [] + + gpu_processes_max: Optional[int] = None + if process: + counts: list[int] = [] + for proc in process: + if not proc.process_list: + continue + if isinstance(proc.process_list[0].process_info, str): + continue + counts.append(len(proc.process_list)) + if counts: + gpu_processes_max = max(counts) + + max_power_w: Optional[int] = None + for gpu in sorted(static, key=lambda s: s.gpu): + lim = gpu.limit + if lim is None or lim.max_power is None or lim.max_power.value is None: + continue + try: + max_power_w = int(float(lim.max_power.value)) + break + except (TypeError, ValueError): + continue + + amdgpu_drv_version: Optional[str] = None + for gpu in sorted(static, key=lambda s: s.gpu): + if gpu.driver and gpu.driver.version: + amdgpu_drv_version = gpu.driver.version + break + + mem_part_mode: Optional[str] = None + compute_part_mode: Optional[str] = None + if partition: + mps = partition.memory_partition + if mps: + mem_part_mode = sorted(mps, key=lambda p: p.gpu_id)[0].partition_type + cps = partition.compute_partition + if cps: + compute_part_mode = sorted(cps, key=lambda p: p.gpu_id)[0].partition_type + + pldm_version: Optional[str] = None + if firmware: + for fw in sorted(firmware, key=lambda f: f.gpu): + if isinstance(fw.fw_list, str): + continue + for item in fw.fw_list: + if item.fw_id == _PLDM_FW_ID: + pldm_version = item.fw_version + break + if pldm_version is not None: + break + + ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None + if static: + first = sorted(static, key=lambda s: s.gpu)[0] + asic = first.asic + ep_vendor_id = asic.vendor_id + ep_subvendor_id = asic.subvendor_id + ep_device_id = asic.device_id + ep_subsystem_id = asic.subsystem_id + ep_market_name = asic.market_name + + xgmi_rates: Optional[list[float]] = None + if xgmi_metric: + rates: set[float] = set() + for xm in xgmi_metric: + br = xm.link_metrics.bit_rate + if br is None or br.value is None: + continue + try: + rates.add(float(br.value)) + except (TypeError, ValueError): + continue + if rates: + xgmi_rates = sorted(rates) + + return AmdSmiAnalysisRef( + gpu_processes_max=gpu_processes_max, + max_power_w=max_power_w, + amdgpu_drv_version=amdgpu_drv_version, + mem_part_mode=mem_part_mode, + compute_part_mode=compute_part_mode, + pldm_version=pldm_version, + ep_vendor_id=ep_vendor_id, + ep_subvendor_id=ep_subvendor_id, + ep_device_id=ep_device_id, + ep_subsystem_id=ep_subsystem_id, + ep_market_name=ep_market_name, + xgmi_rates=xgmi_rates, + ) diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index de9d0312..26248066 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -29,6 +29,7 @@ from pydantic import Field from nodescraper.models import AnalyzerArgs +from nodescraper.plugins.inband.amdsmi.amdsmidata import AmdSmiDataModel class AmdSmiAnalyzerArgs(AnalyzerArgs): @@ -80,3 +81,31 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): analysis_range_end: Optional[datetime] = Field( default=None, description="End of time range for time-windowed analysis." ) + + @classmethod + def build_from_model(cls, datamodel: AmdSmiDataModel) -> "AmdSmiAnalyzerArgs": + """Build analyzer args from data model (reference snapshot set by collector). + + Args: + datamodel (AmdSmiDataModel): data model for plugin + + Returns: + AmdSmiAnalyzerArgs: instance of analyzer args class + """ + r = datamodel.analysis_ref + if r is None: + return cls() + return cls( + expected_gpu_processes=r.gpu_processes_max, + expected_max_power=r.max_power_w, + expected_driver_version=r.amdgpu_drv_version, + expected_memory_partition_mode=r.mem_part_mode, + expected_compute_partition_mode=r.compute_part_mode, + expected_pldm_version=r.pldm_version, + vendorid_ep=r.ep_vendor_id, + vendorid_ep_vf=r.ep_subvendor_id, + devid_ep=r.ep_device_id, + devid_ep_vf=r.ep_subsystem_id, + sku_name=r.ep_market_name, + expected_xgmi_speed=r.xgmi_rates, + ) From 8cbed616bf801be13f625bfbf5fe8ace7b8cd412 Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 9 Apr 2026 10:27:03 -0500 Subject: [PATCH 2/5] expected-firmware-versions arg for matching any id with any version --- docs/PLUGIN_DOC.md | 4 +- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 57 ++++++++++--------- .../plugins/inband/amdsmi/amdsmi_collector.py | 8 ++- .../plugins/inband/amdsmi/amdsmidata.py | 38 +++++++++---- .../plugins/inband/amdsmi/analyzer_args.py | 12 +++- .../plugins/inband/amdsmi/collector_args.py | 4 ++ test/unit/plugin/test_amdsmi_analyzer.py | 57 ++++++++++++++++--- 7 files changed, 126 insertions(+), 54 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index d3d85377..ce4dec61 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -4,7 +4,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | -| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_pldm_version`: Optional[str] — Expected PLDM version string.
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | +| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | | BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | | CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | | DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | @@ -1818,7 +1818,7 @@ Check sysctl matches expected sysctl details - **expected_driver_version**: `Optional[str]` — Expected AMD driver version string. - **expected_memory_partition_mode**: `Optional[str]` — Expected memory partition mode (e.g. sp3, dp). - **expected_compute_partition_mode**: `Optional[str]` — Expected compute partition mode. -- **expected_pldm_version**: `Optional[str]` — Expected PLDM version string. +- **expected_firmware_versions**: `Optional[dict[str, str]]` — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE). - **l0_to_recovery_count_error_threshold**: `Optional[int]` — L0-to-recovery count above which an error is raised. - **l0_to_recovery_count_warning_threshold**: `Optional[int]` — L0-to-recovery count above which a warning is raised. - **vendorid_ep**: `Optional[str]` — Expected endpoint vendor ID (e.g. for PCIe). diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index a8ba797e..9a9cea71 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -534,18 +534,14 @@ def _format_static_mismatch_payload( "per_gpu": per_gpu_list, } - def check_pldm_version( + def check_firmware_versions( self, amdsmi_fw_data: Optional[list[Fw]], - expected_pldm_version: Optional[str], - ): - """Check expected pldm version - - Args: - amdsmi_fw_data (Optional[list[Fw]]): data model - expected_pldm_version (Optional[str]): expected pldm version - """ - PLDM_STRING = "PLDM_BUNDLE" + expected_firmware_versions: dict[str, str], + ) -> None: + """Check that each GPU reports the expected version for each ``fw_id``.""" + if not expected_firmware_versions: + return if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: self._log_event( category=EventCategory.PLATFORM, @@ -554,30 +550,37 @@ def check_pldm_version( data={"amdsmi_fw_data": amdsmi_fw_data}, ) return - mismatched_gpus: list[int] = [] - pldm_missing_gpus: list[int] = [] + mismatches: list[dict[str, object]] = [] + missing: list[dict[str, object]] = [] for fw_data in amdsmi_fw_data: gpu = fw_data.gpu if isinstance(fw_data.fw_list, str): - pldm_missing_gpus.append(gpu) + for fw_id in expected_firmware_versions: + missing.append({"gpu": gpu, "fw_id": fw_id}) continue - for fw_info in fw_data.fw_list: - if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version: - mismatched_gpus.append(gpu) - if PLDM_STRING == fw_info.fw_id: - break - else: - pldm_missing_gpus.append(gpu) + actual_by_id = {item.fw_id: item.fw_version for item in fw_data.fw_list} + for fw_id, expected_ver in expected_firmware_versions.items(): + if fw_id not in actual_by_id: + missing.append({"gpu": gpu, "fw_id": fw_id}) + elif actual_by_id[fw_id] != expected_ver: + mismatches.append( + { + "gpu": gpu, + "fw_id": fw_id, + "expected": expected_ver, + "actual": actual_by_id[fw_id], + } + ) - if mismatched_gpus or pldm_missing_gpus: + if mismatches or missing: self._log_event( category=EventCategory.FW, - description="PLDM Version Mismatch", + description="Firmware version mismatch", priority=EventPriority.ERROR, data={ - "mismatched_gpus": mismatched_gpus, - "pldm_missing_gpus": pldm_missing_gpus, - "expected_pldm_version": expected_pldm_version, + "expected_firmware_versions": expected_firmware_versions, + "mismatches": mismatches, + "missing": missing, }, ) @@ -779,8 +782,8 @@ def analyze_data( args.expected_compute_partition_mode, ) - if args.expected_pldm_version: - self.check_pldm_version(data.firmware, args.expected_pldm_version) + if args.expected_firmware_versions: + self.check_firmware_versions(data.firmware, args.expected_firmware_versions) if data.cper_data: self.analyzer_cpers( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 9c988748..81a0049e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -476,8 +476,14 @@ def _get_amdsmi_data( return None try: + fw_ids = args.analysis_firmware_ids if args and args.analysis_firmware_ids else None analysis_ref = build_amd_smi_analysis_ref( - statics, processes, partition, firmware, xgmi_metric + statics, + processes, + partition, + firmware, + xgmi_metric, + firmware_ids=fw_ids, ) return AmdSmiDataModel( version=version, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 4f83eeb1..ac64c329 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -937,6 +937,7 @@ class AmdSmiAnalysisRef(BaseModel): amdgpu_drv_version: Optional[str] = None mem_part_mode: Optional[str] = None compute_part_mode: Optional[str] = None + firmware_versions: Optional[dict[str, str]] = None pldm_version: Optional[str] = None ep_vendor_id: Optional[str] = None ep_subvendor_id: Optional[str] = None @@ -1024,7 +1025,25 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]: return None -_PLDM_FW_ID = "PLDM_BUNDLE" +_DEFAULT_ANALYSIS_FW_IDS: tuple[str, ...] = ("PLDM_BUNDLE",) + + +def _first_observed_fw_versions(firmware: Optional[list[Fw]], fw_ids: list[str]) -> dict[str, str]: + """For each ``fw_id``, take the version from the lowest GPU index that reports it.""" + out: dict[str, str] = {} + if not firmware or not fw_ids: + return out + need = set(fw_ids) + for fw in sorted(firmware, key=lambda f: f.gpu): + if isinstance(fw.fw_list, str): + continue + for item in fw.fw_list: + if item.fw_id in need and item.fw_id not in out: + out[item.fw_id] = item.fw_version + need.discard(item.fw_id) + if not need: + break + return out def build_amd_smi_analysis_ref( @@ -1033,6 +1052,8 @@ def build_amd_smi_analysis_ref( partition: Optional[Partition], firmware: Optional[list[Fw]], xgmi_metric: Optional[list[XgmiMetrics]], + *, + firmware_ids: Optional[list[str]] = None, ) -> AmdSmiAnalysisRef: """Build analysis summary from collected structures (called by AmdSmiCollector).""" static = static or [] @@ -1076,17 +1097,9 @@ def build_amd_smi_analysis_ref( if cps: compute_part_mode = sorted(cps, key=lambda p: p.gpu_id)[0].partition_type - pldm_version: Optional[str] = None - if firmware: - for fw in sorted(firmware, key=lambda f: f.gpu): - if isinstance(fw.fw_list, str): - continue - for item in fw.fw_list: - if item.fw_id == _PLDM_FW_ID: - pldm_version = item.fw_version - break - if pldm_version is not None: - break + ids = list(firmware_ids) if firmware_ids is not None else list(_DEFAULT_ANALYSIS_FW_IDS) + firmware_versions = _first_observed_fw_versions(firmware, ids) or None + pldm_version = firmware_versions.get("PLDM_BUNDLE") if firmware_versions else None ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None if static: @@ -1118,6 +1131,7 @@ def build_amd_smi_analysis_ref( amdgpu_drv_version=amdgpu_drv_version, mem_part_mode=mem_part_mode, compute_part_mode=compute_part_mode, + firmware_versions=firmware_versions, pldm_version=pldm_version, ep_vendor_id=ep_vendor_id, ep_subvendor_id=ep_subvendor_id, diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 26248066..3a5d2cfb 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -52,8 +52,9 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): expected_compute_partition_mode: Optional[str] = Field( default=None, description="Expected compute partition mode." ) - expected_pldm_version: Optional[str] = Field( - default=None, description="Expected PLDM version string." + expected_firmware_versions: Optional[dict[str, str]] = Field( + default=None, + description="Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).", ) l0_to_recovery_count_error_threshold: Optional[int] = Field( default=3, @@ -95,13 +96,18 @@ def build_from_model(cls, datamodel: AmdSmiDataModel) -> "AmdSmiAnalyzerArgs": r = datamodel.analysis_ref if r is None: return cls() + fw_expect: dict[str, str] = {} + if r.firmware_versions: + fw_expect.update(r.firmware_versions) + if r.pldm_version is not None and "PLDM_BUNDLE" not in fw_expect: + fw_expect["PLDM_BUNDLE"] = r.pldm_version return cls( expected_gpu_processes=r.gpu_processes_max, expected_max_power=r.max_power_w, expected_driver_version=r.amdgpu_drv_version, expected_memory_partition_mode=r.mem_part_mode, expected_compute_partition_mode=r.compute_part_mode, - expected_pldm_version=r.pldm_version, + expected_firmware_versions=dict(fw_expect) if fw_expect else None, vendorid_ep=r.ep_vendor_id, vendorid_ep_vf=r.ep_subvendor_id, devid_ep=r.ep_device_id, diff --git a/nodescraper/plugins/inband/amdsmi/collector_args.py b/nodescraper/plugins/inband/amdsmi/collector_args.py index 1a12d8d5..4fedc39b 100644 --- a/nodescraper/plugins/inband/amdsmi/collector_args.py +++ b/nodescraper/plugins/inband/amdsmi/collector_args.py @@ -33,6 +33,10 @@ class AmdSmiCollectorArgs(CollectorArgs): """Collector arguments for AmdSmiPlugin""" + analysis_firmware_ids: Optional[list[str]] = Field( + default=None, + description=("amd-smi fw_id values to record in analysis_ref.firmware_versions "), + ) cper_file_path: Optional[str] = Field( default=None, description="Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file).", diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index 6bc40330..f3966c97 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -461,8 +461,8 @@ def test_check_static_data_mismatch(mock_analyzer): assert len(analyzer.result.events) >= 1 -def test_check_pldm_version_success(mock_analyzer): - """Test check_pldm_version passes when PLDM version matches.""" +def test_check_firmware_versions_pldm_success(mock_analyzer): + """Test check_firmware_versions passes when PLDM version matches.""" analyzer = mock_analyzer firmware_data = [ @@ -474,13 +474,13 @@ def test_check_pldm_version_success(mock_analyzer): ), ] - analyzer.check_pldm_version(firmware_data, "1.2.3") + analyzer.check_firmware_versions(firmware_data, {"PLDM_BUNDLE": "1.2.3"}) assert len(analyzer.result.events) == 0 -def test_check_pldm_version_mismatch(mock_analyzer): - """Test check_pldm_version logs error when PLDM version doesn't match.""" +def test_check_firmware_versions_pldm_mismatch(mock_analyzer): + """Test check_firmware_versions logs error when PLDM version doesn't match.""" analyzer = mock_analyzer firmware_data = [ @@ -492,14 +492,14 @@ def test_check_pldm_version_mismatch(mock_analyzer): ), ] - analyzer.check_pldm_version(firmware_data, "1.2.4") + analyzer.check_firmware_versions(firmware_data, {"PLDM_BUNDLE": "1.2.4"}) assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.ERROR -def test_check_pldm_version_missing(mock_analyzer): - """Test check_pldm_version handles missing PLDM firmware.""" +def test_check_firmware_versions_pldm_missing(mock_analyzer): + """Test check_firmware_versions handles missing PLDM firmware.""" analyzer = mock_analyzer firmware_data = [ @@ -511,12 +511,51 @@ def test_check_pldm_version_missing(mock_analyzer): ), ] - analyzer.check_pldm_version(firmware_data, "1.2.3") + analyzer.check_firmware_versions(firmware_data, {"PLDM_BUNDLE": "1.2.3"}) assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.ERROR +def test_check_firmware_versions_multiple_fw_ids_success(mock_analyzer): + """Test check_firmware_versions passes when all fw_ids match on each GPU.""" + analyzer = mock_analyzer + firmware_data = [ + Fw( + gpu=0, + fw_list=[ + FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"), + FwListItem(fw_id="OTHER_FW", fw_version="9.0"), + ], + ), + ] + analyzer.check_firmware_versions( + firmware_data, + {"PLDM_BUNDLE": "1.2.3", "OTHER_FW": "9.0"}, + ) + assert len(analyzer.result.events) == 0 + + +def test_check_firmware_versions_one_id_mismatch(mock_analyzer): + """Test check_firmware_versions errors when any fw_id version differs.""" + analyzer = mock_analyzer + firmware_data = [ + Fw( + gpu=0, + fw_list=[ + FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"), + FwListItem(fw_id="OTHER_FW", fw_version="8.0"), + ], + ), + ] + analyzer.check_firmware_versions( + firmware_data, + {"PLDM_BUNDLE": "1.2.3", "OTHER_FW": "9.0"}, + ) + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + + def test_check_expected_memory_partition_mode_success(mock_analyzer): """Test check_expected_memory_partition_mode passes when partition modes match.""" analyzer = mock_analyzer From e7b82050fe48da219de0c5da0ac976ca88dc4eb7 Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 9 Apr 2026 16:12:03 +0000 Subject: [PATCH 3/5] collector fix --- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 81a0049e..19e561cd 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -774,7 +774,9 @@ def get_firmware(self) -> Optional[list[Fw]]: normalized: list[FwListItem] = [] for e in fw_list_raw: if isinstance(e, dict): - fid = e.get("fw_name") + fid = e.get("fw_id") + if fid is None: + fid = e.get("fw_name") ver = e.get("fw_version") normalized.append( FwListItem( From 3883cb7cf9a18c2978f84ab53ddefba607002168 Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 9 Apr 2026 16:28:55 +0000 Subject: [PATCH 4/5] build_amd_smi_analysis_ref method fix --- .../plugins/inband/amdsmi/amdsmi_collector.py | 3 +- .../plugins/inband/amdsmi/amdsmidata.py | 193 +++++++++--------- 2 files changed, 98 insertions(+), 98 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 19e561cd..927a75a8 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -71,7 +71,6 @@ ValueUnit, XgmiLinks, XgmiMetrics, - build_amd_smi_analysis_ref, ) from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs from nodescraper.utils import get_exception_traceback @@ -477,7 +476,7 @@ def _get_amdsmi_data( try: fw_ids = args.analysis_firmware_ids if args and args.analysis_firmware_ids else None - analysis_ref = build_amd_smi_analysis_ref( + analysis_ref = AmdSmiDataModel.build_analysis_ref( statics, processes, partition, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index ac64c329..c5ab3562 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1024,6 +1024,103 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]: return item return None + @classmethod + def build_analysis_ref( + cls, + static: Optional[list[AmdSmiStatic]], + process: Optional[list[Processes]], + partition: Optional[Partition], + firmware: Optional[list[Fw]], + xgmi_metric: Optional[list[XgmiMetrics]], + *, + firmware_ids: Optional[list[str]] = None, + ) -> AmdSmiAnalysisRef: + """Build ``AmdSmiAnalysisRef`` from collected fields (used when constructing this model).""" + static = static or [] + + gpu_processes_max: Optional[int] = None + if process: + counts: list[int] = [] + for proc in process: + if not proc.process_list: + continue + if isinstance(proc.process_list[0].process_info, str): + continue + counts.append(len(proc.process_list)) + if counts: + gpu_processes_max = max(counts) + + max_power_w: Optional[int] = None + for gpu in sorted(static, key=lambda s: s.gpu): + lim = gpu.limit + if lim is None or lim.max_power is None or lim.max_power.value is None: + continue + try: + max_power_w = int(float(lim.max_power.value)) + break + except (TypeError, ValueError): + continue + + amdgpu_drv_version: Optional[str] = None + for gpu in sorted(static, key=lambda s: s.gpu): + if gpu.driver and gpu.driver.version: + amdgpu_drv_version = gpu.driver.version + break + + mem_part_mode: Optional[str] = None + compute_part_mode: Optional[str] = None + if partition: + mps = partition.memory_partition + if mps: + mem_part_mode = sorted(mps, key=lambda p: p.gpu_id)[0].partition_type + cps = partition.compute_partition + if cps: + compute_part_mode = sorted(cps, key=lambda p: p.gpu_id)[0].partition_type + + ids = list(firmware_ids) if firmware_ids is not None else list(_DEFAULT_ANALYSIS_FW_IDS) + firmware_versions = _first_observed_fw_versions(firmware, ids) or None + pldm_version = firmware_versions.get("PLDM_BUNDLE") if firmware_versions else None + + ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None + if static: + first = sorted(static, key=lambda s: s.gpu)[0] + asic = first.asic + ep_vendor_id = asic.vendor_id + ep_subvendor_id = asic.subvendor_id + ep_device_id = asic.device_id + ep_subsystem_id = asic.subsystem_id + ep_market_name = asic.market_name + + xgmi_rates: Optional[list[float]] = None + if xgmi_metric: + rates: set[float] = set() + for xm in xgmi_metric: + br = xm.link_metrics.bit_rate + if br is None or br.value is None: + continue + try: + rates.add(float(br.value)) + except (TypeError, ValueError): + continue + if rates: + xgmi_rates = sorted(rates) + + return AmdSmiAnalysisRef( + gpu_processes_max=gpu_processes_max, + max_power_w=max_power_w, + amdgpu_drv_version=amdgpu_drv_version, + mem_part_mode=mem_part_mode, + compute_part_mode=compute_part_mode, + firmware_versions=firmware_versions, + pldm_version=pldm_version, + ep_vendor_id=ep_vendor_id, + ep_subvendor_id=ep_subvendor_id, + ep_device_id=ep_device_id, + ep_subsystem_id=ep_subsystem_id, + ep_market_name=ep_market_name, + xgmi_rates=xgmi_rates, + ) + _DEFAULT_ANALYSIS_FW_IDS: tuple[str, ...] = ("PLDM_BUNDLE",) @@ -1044,99 +1141,3 @@ def _first_observed_fw_versions(firmware: Optional[list[Fw]], fw_ids: list[str]) if not need: break return out - - -def build_amd_smi_analysis_ref( - static: Optional[list[AmdSmiStatic]], - process: Optional[list[Processes]], - partition: Optional[Partition], - firmware: Optional[list[Fw]], - xgmi_metric: Optional[list[XgmiMetrics]], - *, - firmware_ids: Optional[list[str]] = None, -) -> AmdSmiAnalysisRef: - """Build analysis summary from collected structures (called by AmdSmiCollector).""" - static = static or [] - - gpu_processes_max: Optional[int] = None - if process: - counts: list[int] = [] - for proc in process: - if not proc.process_list: - continue - if isinstance(proc.process_list[0].process_info, str): - continue - counts.append(len(proc.process_list)) - if counts: - gpu_processes_max = max(counts) - - max_power_w: Optional[int] = None - for gpu in sorted(static, key=lambda s: s.gpu): - lim = gpu.limit - if lim is None or lim.max_power is None or lim.max_power.value is None: - continue - try: - max_power_w = int(float(lim.max_power.value)) - break - except (TypeError, ValueError): - continue - - amdgpu_drv_version: Optional[str] = None - for gpu in sorted(static, key=lambda s: s.gpu): - if gpu.driver and gpu.driver.version: - amdgpu_drv_version = gpu.driver.version - break - - mem_part_mode: Optional[str] = None - compute_part_mode: Optional[str] = None - if partition: - mps = partition.memory_partition - if mps: - mem_part_mode = sorted(mps, key=lambda p: p.gpu_id)[0].partition_type - cps = partition.compute_partition - if cps: - compute_part_mode = sorted(cps, key=lambda p: p.gpu_id)[0].partition_type - - ids = list(firmware_ids) if firmware_ids is not None else list(_DEFAULT_ANALYSIS_FW_IDS) - firmware_versions = _first_observed_fw_versions(firmware, ids) or None - pldm_version = firmware_versions.get("PLDM_BUNDLE") if firmware_versions else None - - ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None - if static: - first = sorted(static, key=lambda s: s.gpu)[0] - asic = first.asic - ep_vendor_id = asic.vendor_id - ep_subvendor_id = asic.subvendor_id - ep_device_id = asic.device_id - ep_subsystem_id = asic.subsystem_id - ep_market_name = asic.market_name - - xgmi_rates: Optional[list[float]] = None - if xgmi_metric: - rates: set[float] = set() - for xm in xgmi_metric: - br = xm.link_metrics.bit_rate - if br is None or br.value is None: - continue - try: - rates.add(float(br.value)) - except (TypeError, ValueError): - continue - if rates: - xgmi_rates = sorted(rates) - - return AmdSmiAnalysisRef( - gpu_processes_max=gpu_processes_max, - max_power_w=max_power_w, - amdgpu_drv_version=amdgpu_drv_version, - mem_part_mode=mem_part_mode, - compute_part_mode=compute_part_mode, - firmware_versions=firmware_versions, - pldm_version=pldm_version, - ep_vendor_id=ep_vendor_id, - ep_subvendor_id=ep_subvendor_id, - ep_device_id=ep_device_id, - ep_subsystem_id=ep_subsystem_id, - ep_market_name=ep_market_name, - xgmi_rates=xgmi_rates, - ) From 72b5a27e7f1de8349000ff91f308fd9e9d007ae9 Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 9 Apr 2026 16:42:51 +0000 Subject: [PATCH 5/5] model fix --- .../plugins/inband/amdsmi/amdsmi_collector.py | 14 +- .../plugins/inband/amdsmi/amdsmidata.py | 210 ++++++++++-------- 2 files changed, 127 insertions(+), 97 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 927a75a8..d4f22c46 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -476,15 +476,7 @@ def _get_amdsmi_data( try: fw_ids = args.analysis_firmware_ids if args and args.analysis_firmware_ids else None - analysis_ref = AmdSmiDataModel.build_analysis_ref( - statics, - processes, - partition, - firmware, - xgmi_metric, - firmware_ids=fw_ids, - ) - return AmdSmiDataModel( + base = AmdSmiDataModel( version=version, gpu_list=gpu_list, process=processes, @@ -498,8 +490,10 @@ def _get_amdsmi_data( xgmi_link=xgmi_link or [], cper_data=cper_data, cper_afids=cper_afids, - analysis_ref=analysis_ref, + analysis_firmware_ids=fw_ids, + analysis_ref=None, ) + return base.model_copy(update={"analysis_ref": base.build_analysis_ref()}) except ValidationError as err: self.logger.warning("Validation err: %s", err) self._log_event( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index c5ab3562..940047ba 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -930,8 +930,6 @@ class Topo(BaseModel): class AmdSmiAnalysisRef(BaseModel): """Collector-filled summary for reference config""" - model_config = ConfigDict(extra="forbid") - gpu_processes_max: Optional[int] = None max_power_w: Optional[int] = None amdgpu_drv_version: Optional[str] = None @@ -977,6 +975,11 @@ class AmdSmiDataModel(DataModel): cper_data: Optional[list[FileModel]] = Field(default_factory=list) cper_afids: dict[str, int] = Field(default_factory=dict) + analysis_firmware_ids: Optional[list[str]] = Field( + default=None, + description="fw_id values used when snapshotting firmware_versions into analysis_ref.", + ) + analysis_ref: Optional[AmdSmiAnalysisRef] = None def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: @@ -1024,101 +1027,134 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]: return item return None - @classmethod - def build_analysis_ref( - cls, - static: Optional[list[AmdSmiStatic]], - process: Optional[list[Processes]], - partition: Optional[Partition], - firmware: Optional[list[Fw]], - xgmi_metric: Optional[list[XgmiMetrics]], - *, - firmware_ids: Optional[list[str]] = None, - ) -> AmdSmiAnalysisRef: - """Build ``AmdSmiAnalysisRef`` from collected fields (used when constructing this model).""" - static = static or [] - - gpu_processes_max: Optional[int] = None - if process: - counts: list[int] = [] - for proc in process: - if not proc.process_list: - continue - if isinstance(proc.process_list[0].process_info, str): - continue - counts.append(len(proc.process_list)) - if counts: - gpu_processes_max = max(counts) - - max_power_w: Optional[int] = None - for gpu in sorted(static, key=lambda s: s.gpu): + def _sorted_static_gpus(self) -> list[AmdSmiStatic]: + return sorted(self.static or [], key=lambda s: s.gpu) + + @property + def ref_gpu_processes_max(self) -> Optional[int]: + """Max process-list length across GPUs (for analysis reference snapshot).""" + proc = self.process + if not proc: + return None + counts: list[int] = [] + for p in proc: + if not p.process_list: + continue + if isinstance(p.process_list[0].process_info, str): + continue + counts.append(len(p.process_list)) + return max(counts) if counts else None + + @property + def ref_max_power_w(self) -> Optional[int]: + """First available max power limit (W) from static data, lowest GPU index first.""" + for gpu in self._sorted_static_gpus(): lim = gpu.limit if lim is None or lim.max_power is None or lim.max_power.value is None: continue try: - max_power_w = int(float(lim.max_power.value)) - break + return int(float(lim.max_power.value)) except (TypeError, ValueError): continue + return None - amdgpu_drv_version: Optional[str] = None - for gpu in sorted(static, key=lambda s: s.gpu): + @property + def ref_amdgpu_drv_version(self) -> Optional[str]: + """Driver version from the lowest-index GPU with static data.""" + for gpu in self._sorted_static_gpus(): if gpu.driver and gpu.driver.version: - amdgpu_drv_version = gpu.driver.version - break - - mem_part_mode: Optional[str] = None - compute_part_mode: Optional[str] = None - if partition: - mps = partition.memory_partition - if mps: - mem_part_mode = sorted(mps, key=lambda p: p.gpu_id)[0].partition_type - cps = partition.compute_partition - if cps: - compute_part_mode = sorted(cps, key=lambda p: p.gpu_id)[0].partition_type - - ids = list(firmware_ids) if firmware_ids is not None else list(_DEFAULT_ANALYSIS_FW_IDS) - firmware_versions = _first_observed_fw_versions(firmware, ids) or None - pldm_version = firmware_versions.get("PLDM_BUNDLE") if firmware_versions else None - - ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None - if static: - first = sorted(static, key=lambda s: s.gpu)[0] - asic = first.asic - ep_vendor_id = asic.vendor_id - ep_subvendor_id = asic.subvendor_id - ep_device_id = asic.device_id - ep_subsystem_id = asic.subsystem_id - ep_market_name = asic.market_name - - xgmi_rates: Optional[list[float]] = None - if xgmi_metric: - rates: set[float] = set() - for xm in xgmi_metric: - br = xm.link_metrics.bit_rate - if br is None or br.value is None: - continue - try: - rates.add(float(br.value)) - except (TypeError, ValueError): - continue - if rates: - xgmi_rates = sorted(rates) + return gpu.driver.version + return None + + @property + def ref_mem_part_mode(self) -> Optional[str]: + if self.partition is None: + return None + mps = self.partition.memory_partition + if not mps: + return None + return sorted(mps, key=lambda p: p.gpu_id)[0].partition_type + + @property + def ref_compute_part_mode(self) -> Optional[str]: + if self.partition is None: + return None + cps = self.partition.compute_partition + if not cps: + return None + return sorted(cps, key=lambda p: p.gpu_id)[0].partition_type + + @property + def ref_firmware_versions(self) -> Optional[dict[str, str]]: + ids = ( + list(self.analysis_firmware_ids) + if self.analysis_firmware_ids is not None + else list(_DEFAULT_ANALYSIS_FW_IDS) + ) + return _first_observed_fw_versions(self.firmware, ids) or None + + @property + def ref_pldm_version(self) -> Optional[str]: + fw = self.ref_firmware_versions + return fw.get("PLDM_BUNDLE") if fw else None + + @property + def ref_ep_vendor_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.vendor_id if ss else None + + @property + def ref_ep_subvendor_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.subvendor_id if ss else None + + @property + def ref_ep_device_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.device_id if ss else None + + @property + def ref_ep_subsystem_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.subsystem_id if ss else None + + @property + def ref_ep_market_name(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.market_name if ss else None + + @property + def ref_xgmi_rates(self) -> Optional[list[float]]: + xm = self.xgmi_metric + if not xm: + return None + rates: set[float] = set() + for m in xm: + br = m.link_metrics.bit_rate + if br is None or br.value is None: + continue + try: + rates.add(float(br.value)) + except (TypeError, ValueError): + continue + return sorted(rates) if rates else None + def build_analysis_ref(self) -> AmdSmiAnalysisRef: + """Build ``AmdSmiAnalysisRef`` from current field values""" return AmdSmiAnalysisRef( - gpu_processes_max=gpu_processes_max, - max_power_w=max_power_w, - amdgpu_drv_version=amdgpu_drv_version, - mem_part_mode=mem_part_mode, - compute_part_mode=compute_part_mode, - firmware_versions=firmware_versions, - pldm_version=pldm_version, - ep_vendor_id=ep_vendor_id, - ep_subvendor_id=ep_subvendor_id, - ep_device_id=ep_device_id, - ep_subsystem_id=ep_subsystem_id, - ep_market_name=ep_market_name, - xgmi_rates=xgmi_rates, + gpu_processes_max=self.ref_gpu_processes_max, + max_power_w=self.ref_max_power_w, + amdgpu_drv_version=self.ref_amdgpu_drv_version, + mem_part_mode=self.ref_mem_part_mode, + compute_part_mode=self.ref_compute_part_mode, + firmware_versions=self.ref_firmware_versions, + pldm_version=self.ref_pldm_version, + ep_vendor_id=self.ref_ep_vendor_id, + ep_subvendor_id=self.ref_ep_subvendor_id, + ep_device_id=self.ref_ep_device_id, + ep_subsystem_id=self.ref_ep_subsystem_id, + ep_market_name=self.ref_ep_market_name, + xgmi_rates=self.ref_xgmi_rates, )