| // SPDX-License-Identifier: MIT |
| /* |
| * Copyright © 2025 Intel Corporation |
| */ |
| |
| #include "xe_survivability_mode.h" |
| #include "xe_survivability_mode_types.h" |
| |
| #include <linux/kobject.h> |
| #include <linux/pci.h> |
| #include <linux/sysfs.h> |
| |
| #include "xe_device.h" |
| #include "xe_gt.h" |
| #include "xe_heci_gsc.h" |
| #include "xe_mmio.h" |
| #include "xe_pcode_api.h" |
| #include "xe_vsec.h" |
| |
/* Number of entries allocated for the survivability info table; bounds every walk over it. */
| #define MAX_SCRATCH_MMIO 8 |
| |
| /** |
| * DOC: Xe Boot Survivability |
| * |
| * Boot Survivability is a software-based workflow for recovering a system in a failed boot |
| * state. Here, system recoverability is concerned with recovering the firmware responsible |
| * for boot. |
| * |
| * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware |
| * to be flashed through mei and collect telemetry. The driver's probe flow is modified |
| * such that it enters survivability mode when pcode initialization is incomplete and boot status |
| * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating |
| * survivability mode and provides additional information required for debug. |
| * |
| * KMD exposes the following admin-only readable sysfs file in survivability mode: |
| * |
| * device/survivability_mode: The presence of this file indicates that the card is in survivability |
| * mode. Also, provides additional information on why the driver entered |
| * survivability mode. |
| * |
| * Capability Information - Provides boot status |
| * Postcode Information - Provides information about the failure |
| * Overflow Information - Provides history of previous failures |
| * Auxiliary Information - Certain failures may have information in |
| * addition to postcode information |
| */ |
| |
| static u32 aux_history_offset(u32 reg_value) |
| { |
| return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); |
| } |
| |
| static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, |
| int id, char *name) |
| { |
| strscpy(info[id].name, name, sizeof(info[id].name)); |
| info[id].reg = PCODE_SCRATCH(id).raw; |
| info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); |
| } |
| |
| static void populate_survivability_info(struct xe_device *xe) |
| { |
| struct xe_survivability *survivability = &xe->survivability; |
| struct xe_survivability_info *info = survivability->info; |
| struct xe_mmio *mmio; |
| u32 id = 0, reg_value; |
| char name[NAME_MAX]; |
| int index; |
| |
| mmio = xe_root_tile_mmio(xe); |
| set_survivability_info(mmio, info, id, "Capability Info"); |
| reg_value = info[id].value; |
| |
| if (reg_value & HISTORY_TRACKING) { |
| id++; |
| set_survivability_info(mmio, info, id, "Postcode Info"); |
| |
| if (reg_value & OVERFLOW_SUPPORT) { |
| id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); |
| set_survivability_info(mmio, info, id, "Overflow Info"); |
| } |
| } |
| |
| if (reg_value & AUXINFO_SUPPORT) { |
| id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); |
| |
| for (index = 0; id && reg_value; index++, reg_value = info[id].value, |
| id = aux_history_offset(reg_value)) { |
| snprintf(name, NAME_MAX, "Auxiliary Info %d", index); |
| set_survivability_info(mmio, info, id, name); |
| } |
| } |
| } |
| |
| static void log_survivability_info(struct pci_dev *pdev) |
| { |
| struct xe_device *xe = pdev_to_xe_device(pdev); |
| struct xe_survivability *survivability = &xe->survivability; |
| struct xe_survivability_info *info = survivability->info; |
| int id; |
| |
| dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", |
| survivability->boot_status); |
| for (id = 0; id < MAX_SCRATCH_MMIO; id++) { |
| if (info[id].reg) |
| dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, |
| info[id].reg, info[id].value); |
| } |
| } |
| |
| static ssize_t survivability_mode_show(struct device *dev, |
| struct device_attribute *attr, char *buff) |
| { |
| struct pci_dev *pdev = to_pci_dev(dev); |
| struct xe_device *xe = pdev_to_xe_device(pdev); |
| struct xe_survivability *survivability = &xe->survivability; |
| struct xe_survivability_info *info = survivability->info; |
| int index = 0, count = 0; |
| |
| for (index = 0; index < MAX_SCRATCH_MMIO; index++) { |
| if (info[index].reg) |
| count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, |
| info[index].reg, info[index].value); |
| } |
| |
| return count; |
| } |
| |
| static DEVICE_ATTR_ADMIN_RO(survivability_mode); |
| |
| static void xe_survivability_mode_fini(void *arg) |
| { |
| struct xe_device *xe = arg; |
| struct pci_dev *pdev = to_pci_dev(xe->drm.dev); |
| struct device *dev = &pdev->dev; |
| |
| sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); |
| } |
| |
| static int enable_survivability_mode(struct pci_dev *pdev) |
| { |
| struct device *dev = &pdev->dev; |
| struct xe_device *xe = pdev_to_xe_device(pdev); |
| struct xe_survivability *survivability = &xe->survivability; |
| int ret = 0; |
| |
| /* create survivability mode sysfs */ |
| ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); |
| if (ret) { |
| dev_warn(dev, "Failed to create survivability sysfs files\n"); |
| return ret; |
| } |
| |
| ret = devm_add_action_or_reset(xe->drm.dev, |
| xe_survivability_mode_fini, xe); |
| if (ret) |
| return ret; |
| |
| /* Make sure xe_heci_gsc_init() knows about survivability mode */ |
| survivability->mode = true; |
| |
| ret = xe_heci_gsc_init(xe); |
| if (ret) { |
| /* |
| * But if it fails, device can't enter survivability |
| * so move it back for correct error handling |
| */ |
| survivability->mode = false; |
| return ret; |
| } |
| |
| xe_vsec_init(xe); |
| |
| dev_err(dev, "In Survivability Mode\n"); |
| |
| return 0; |
| } |
| |
| /** |
| * xe_survivability_mode_is_enabled - check if survivability mode is enabled |
| * @xe: xe device instance |
| * |
| * Returns true if in survivability mode, false otherwise |
| */ |
| bool xe_survivability_mode_is_enabled(struct xe_device *xe) |
| { |
| return xe->survivability.mode; |
| } |
| |
| /* |
| * survivability_mode_requested - check if it's possible to enable |
| * survivability mode and that was requested by firmware |
| * |
| * This function reads the boot status from Pcode. |
| * |
| * Return: true if platform support is available and boot status indicates |
| * failure, false otherwise. |
| */ |
| static bool survivability_mode_requested(struct xe_device *xe) |
| { |
| struct xe_survivability *survivability = &xe->survivability; |
| struct xe_mmio *mmio = xe_root_tile_mmio(xe); |
| u32 data; |
| |
| if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe)) |
| return false; |
| |
| data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); |
| survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); |
| |
| return survivability->boot_status == NON_CRITICAL_FAILURE || |
| survivability->boot_status == CRITICAL_FAILURE; |
| } |
| |
| /** |
| * xe_survivability_mode_enable - Initialize and enable the survivability mode |
| * @xe: xe device instance |
| * |
| * Initialize survivability information and enable survivability mode |
| * |
| * Return: 0 if survivability mode is enabled or not requested; negative error |
| * code otherwise. |
| */ |
| int xe_survivability_mode_enable(struct xe_device *xe) |
| { |
| struct xe_survivability *survivability = &xe->survivability; |
| struct xe_survivability_info *info; |
| struct pci_dev *pdev = to_pci_dev(xe->drm.dev); |
| |
| if (!survivability_mode_requested(xe)) |
| return 0; |
| |
| survivability->size = MAX_SCRATCH_MMIO; |
| |
| info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), |
| GFP_KERNEL); |
| if (!info) |
| return -ENOMEM; |
| |
| survivability->info = info; |
| |
| populate_survivability_info(xe); |
| |
| /* Only log debug information and exit if it is a critical failure */ |
| if (survivability->boot_status == CRITICAL_FAILURE) { |
| log_survivability_info(pdev); |
| return -ENXIO; |
| } |
| |
| return enable_survivability_mode(pdev); |
| } |