Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/PLUGIN_DOC.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

| Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer |
| --- | --- | --- | --- | --- | --- | --- |
| AmdSmiPlugin | bad-pages<br>firmware --json<br>list --json<br>metric -g all<br>partition --json<br>process --json<br>ras --cper --folder={folder}<br>ras --afid --cper-file {cper_file}<br>static -g all --json<br>static -g {gpu_id} --json<br>topology<br>version --json<br>xgmi -l<br>xgmi -m | **Analyzer Args:**<br>- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).<br>- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.<br>- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).<br>- `expected_driver_version`: Optional[str] — Expected AMD driver version string.<br>- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).<br>- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.<br>- `expected_pldm_version`: Optional[str] — Expected PLDM version string.<br>- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.<br>- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.<br>- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).<br>- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.<br>- `devid_ep`: Optional[str] — Expected endpoint device ID.<br>- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.<br>- `sku_name`: Optional[str] — Expected SKU name string for GPU.<br>- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).<br>- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.<br>- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**<br>- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) |
| AmdSmiPlugin | bad-pages<br>firmware --json<br>list --json<br>metric -g all<br>partition --json<br>process --json<br>ras --cper --folder={folder}<br>ras --afid --cper-file {cper_file}<br>static -g all --json<br>static -g {gpu_id} --json<br>topology<br>version --json<br>xgmi -l<br>xgmi -m | **Analyzer Args:**<br>- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).<br>- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.<br>- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).<br>- `expected_driver_version`: Optional[str] — Expected AMD driver version string.<br>- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).<br>- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.<br>- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).<br>- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.<br>- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.<br>- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).<br>- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.<br>- `devid_ep`: Optional[str] — Expected endpoint device ID.<br>- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.<br>- `sku_name`: Optional[str] — Expected SKU name string for GPU.<br>- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).<br>- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.<br>- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**<br>- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) |
| BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'<br>wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**<br>- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).<br>- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) |
| CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**<br>- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').<br>- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.<br>- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).<br>- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) |
| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor &#124; Measure-Object).Count"<br>lspci -d {vendorid_ep}: &#124; grep -i 'VGA\&#124;Display\&#124;3D' &#124; wc -l<br>powershell -Command "(wmic path win32_VideoController get name &#124; findstr AMD &#124; Measure-Object).Count"<br>lscpu<br>lshw<br>lspci -d {vendorid_ep}: &#124; grep -i 'Virtual Function' &#124; wc -l<br>powershell -Command "(Get-VMHostPartitionableGpu &#124; Measure-Object).Count" | **Analyzer Args:**<br>- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.<br>- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.<br>- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) |
Expand Down Expand Up @@ -1843,7 +1843,7 @@ Check sysctl matches expected sysctl details
- **expected_driver_version**: `Optional[str]` — Expected AMD driver version string.
- **expected_memory_partition_mode**: `Optional[str]` — Expected memory partition mode (e.g. sp3, dp).
- **expected_compute_partition_mode**: `Optional[str]` — Expected compute partition mode.
- **expected_pldm_version**: `Optional[str]` — Expected PLDM version string.
- **expected_firmware_versions**: `Optional[dict[str, str]]` — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- **l0_to_recovery_count_error_threshold**: `Optional[int]` — L0-to-recovery count above which an error is raised.
- **l0_to_recovery_count_warning_threshold**: `Optional[int]` — L0-to-recovery count above which a warning is raised.
- **vendorid_ep**: `Optional[str]` — Expected endpoint vendor ID (e.g. for PCIe).
Expand Down
62 changes: 33 additions & 29 deletions nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,18 +534,14 @@ def _format_static_mismatch_payload(
"per_gpu": per_gpu_list,
}

def check_pldm_version(
def check_firmware_versions(
self,
amdsmi_fw_data: Optional[list[Fw]],
expected_pldm_version: Optional[str],
):
"""Check expected pldm version

Args:
amdsmi_fw_data (Optional[list[Fw]]): data model
expected_pldm_version (Optional[str]): expected pldm version
"""
PLDM_STRING = "PLDM_BUNDLE"
expected_firmware_versions: dict[str, str],
) -> None:
"""Check that each GPU reports the expected version for each ``fw_id``."""
if not expected_firmware_versions:
return
if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0:
self._log_event(
category=EventCategory.PLATFORM,
Expand All @@ -554,30 +550,37 @@ def check_pldm_version(
data={"amdsmi_fw_data": amdsmi_fw_data},
)
return
mismatched_gpus: list[int] = []
pldm_missing_gpus: list[int] = []
mismatches: list[dict[str, object]] = []
missing: list[dict[str, object]] = []
for fw_data in amdsmi_fw_data:
gpu = fw_data.gpu
if isinstance(fw_data.fw_list, str):
pldm_missing_gpus.append(gpu)
for fw_id in expected_firmware_versions:
missing.append({"gpu": gpu, "fw_id": fw_id})
continue
for fw_info in fw_data.fw_list:
if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version:
mismatched_gpus.append(gpu)
if PLDM_STRING == fw_info.fw_id:
break
else:
pldm_missing_gpus.append(gpu)
actual_by_id = {item.fw_id: item.fw_version for item in fw_data.fw_list}
for fw_id, expected_ver in expected_firmware_versions.items():
if fw_id not in actual_by_id:
missing.append({"gpu": gpu, "fw_id": fw_id})
elif actual_by_id[fw_id] != expected_ver:
mismatches.append(
{
"gpu": gpu,
"fw_id": fw_id,
"expected": expected_ver,
"actual": actual_by_id[fw_id],
}
)

if mismatched_gpus or pldm_missing_gpus:
if mismatches or missing:
self._log_event(
category=EventCategory.FW,
description="PLDM Version Mismatch",
description="Firmware version mismatch",
priority=EventPriority.ERROR,
data={
"mismatched_gpus": mismatched_gpus,
"pldm_missing_gpus": pldm_missing_gpus,
"expected_pldm_version": expected_pldm_version,
"expected_firmware_versions": expected_firmware_versions,
"mismatches": mismatches,
"missing": missing,
},
)

Expand Down Expand Up @@ -661,8 +664,9 @@ def check_expected_xgmi_link_speed(
if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0:
self._log_event(
category=EventCategory.IO,
description="Expected XGMI speed not configured, skipping XGMI link speed check",
priority=EventPriority.WARNING,
description=("Expected XGMI link speed not set; skipping XGMI link speed analysis"),
priority=EventPriority.INFO,
console_log=True,
)
return

Expand Down Expand Up @@ -778,8 +782,8 @@ def analyze_data(
args.expected_compute_partition_mode,
)

if args.expected_pldm_version:
self.check_pldm_version(data.firmware, args.expected_pldm_version)
if args.expected_firmware_versions:
self.check_firmware_versions(data.firmware, args.expected_firmware_versions)

if data.cper_data:
self.analyzer_cpers(
Expand Down
10 changes: 8 additions & 2 deletions nodescraper/plugins/inband/amdsmi/amdsmi_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,8 @@ def _get_amdsmi_data(
return None

try:
return AmdSmiDataModel(
fw_ids = args.analysis_firmware_ids if args and args.analysis_firmware_ids else None
base = AmdSmiDataModel(
version=version,
gpu_list=gpu_list,
process=processes,
Expand All @@ -489,7 +490,10 @@ def _get_amdsmi_data(
xgmi_link=xgmi_link or [],
cper_data=cper_data,
cper_afids=cper_afids,
analysis_firmware_ids=fw_ids,
analysis_ref=None,
)
return base.model_copy(update={"analysis_ref": base.build_analysis_ref()})
except ValidationError as err:
self.logger.warning("Validation err: %s", err)
self._log_event(
Expand Down Expand Up @@ -763,7 +767,9 @@ def get_firmware(self) -> Optional[list[Fw]]:
normalized: list[FwListItem] = []
for e in fw_list_raw:
if isinstance(e, dict):
fid = e.get("fw_name")
fid = e.get("fw_id")
if fid is None:
fid = e.get("fw_name")
ver = e.get("fw_version")
normalized.append(
FwListItem(
Expand Down
Loading
Loading