nvmesensor: fix SdBusError crash during host warm-reboot
Port fix from https://github.com/amboar/dbus-sensors/commits/mctp-mi
Background: https://github.com/CodeConstruct/dbus-sensors/issues/1
Tested:
After the fix, the nvmesensor won't crash during host warm-reboot
Google-Bug-Id: 301320438
Change-Id: Iffe54c47958e22367c84394cd71588f31326be0f
Signed-off-by: Jinliang Wang <jinliangw@google.com>
(cherry picked from commit 2dfb61dec61701a3602a5efdddbee03675edfb58)
diff --git a/recipes-phosphor/sensors/dbus-sensors/0025-nvmesensor-Add-getPrimaryController-helper.patch b/recipes-phosphor/sensors/dbus-sensors/0025-nvmesensor-Add-getPrimaryController-helper.patch
new file mode 100644
index 0000000..e895e37
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0025-nvmesensor-Add-getPrimaryController-helper.patch
@@ -0,0 +1,153 @@
+From 5b4406829b423397b3b708b3172d3590440b07a7 Mon Sep 17 00:00:00 2001
+From: Matt Johnston <matt@codeconstruct.com.au>
+Date: Thu, 13 Jul 2023 15:58:27 +0800
+Subject: [PATCH 1/7] nvmesensor: Add getPrimaryController() helper
+
+This avoids prevents null pointer dereferences when dbus calls are
+received for non-functional subsystems (no controllers available).
+That can occur when a controller is set non-functional due to failing
+health checks.
+
+A Common::Error::Unavailable exception is thrown when no controller is
+available.
+
+Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
+Change-Id: I9aeb16c48d05dab8ac76ef36bc37239157d659d6
+Signed-off-by: Jinliang Wang <jinliangw@google.com>
+---
+ src/NVMeSubsys.cpp | 37 +++++++++++++++++++++++++------------
+ src/NVMeSubsys.hpp | 5 ++++-
+ 2 files changed, 29 insertions(+), 13 deletions(-)
+
+diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp
+index 2e94abb..69d0d4f 100644
+--- a/src/NVMeSubsys.cpp
++++ b/src/NVMeSubsys.cpp
+@@ -376,6 +376,19 @@ void NVMeSubsystem::markAvailable(bool toggle)
+ // TODO: make the Available interface false
+ UnavailableCount = UnavailableMaxCount;
+ }
++
++std::shared_ptr<NVMeControllerEnabled>
++ NVMeSubsystem::getPrimaryController() const
++{
++ if (!primaryController)
++ {
++ std::cerr << "dbus call for inactive NVMe subsystem " << name
++ << ". Returning Unavailable\n";
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++ return primaryController;
++}
++
+ void NVMeSubsystem::start()
+ {
+ // add thermal sensor for the subsystem
+@@ -629,7 +642,7 @@ sdbusplus::message::object_path
+ std::make_shared<CommonErr::InvalidArgument>());
+ }
+
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ using callback_t = void(std::tuple<nvme_ex_ptr, NVMeNSIdentify>);
+@@ -719,7 +732,7 @@ std::string NVMeSubsystem::volumePath(uint32_t nsid) const
+
+ void NVMeSubsystem::addIdentifyNamespace(uint32_t nsid)
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_ALLOCATED_NS,
+ nsid, NVME_CNTLID_NONE,
+@@ -781,7 +794,7 @@ void NVMeSubsystem::addIdentifyNamespace(uint32_t nsid)
+
+ void NVMeSubsystem::updateVolumes()
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminListNamespaces(
+ ctrl, [ctrl, intf, self{shared_from_this()}](
+@@ -822,7 +835,7 @@ void NVMeSubsystem::updateVolumes()
+
+ void NVMeSubsystem::fillDrive()
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_CTRL,
+ NVME_NSID_NONE, NVME_CNTLID_NONE,
+@@ -845,10 +858,10 @@ void NVMeSubsystem::fillDrive()
+ // https://gerrit.openbmc.org/c/openbmc/phosphor-dbus-interfaces/+/43458/2/xyz/openbmc_project/Software/Version.interface.yaml#47
+ // TODO find/write a better reference
+ std::string v("xyz.openbmc_project.NVMe.ControllerFirmwareVersion");
+- self->primaryController->version(v);
+- self->primaryController->purpose(
+- SoftwareVersion::VersionPurpose::Other);
+- self->primaryController->extendedVersion(v + ":" + fwVer);
++ auto pc = self->getPrimaryController();
++ pc->version(v);
++ pc->purpose(SoftwareVersion::VersionPurpose::Other);
++ pc->extendedVersion(v + ":" + fwVer);
+ }
+ });
+ }
+@@ -1000,7 +1013,7 @@ void NVMeSubsystem::forgetVolume(std::shared_ptr<NVMeVolume> volume)
+ void NVMeSubsystem::querySupportedFormats()
+ {
+
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(
+ ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_NS, NVME_NSID_ALL,
+@@ -1044,7 +1057,7 @@ void NVMeSubsystem::querySupportedFormats()
+ void NVMeSubsystem::deleteVolume(boost::asio::yield_context yield,
+ std::shared_ptr<NVMeVolume> volume)
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ using callback_t = void(std::tuple<std::error_code, int>);
+@@ -1072,7 +1085,7 @@ void NVMeSubsystem::startSanitize(
+ const NVMeSanitizeParams& params,
+ std::function<void(nvme_ex_ptr ex)>&& submitCb)
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ intf->adminSanitize(ctrl, params.nvmeAction(), params.passes,
+@@ -1085,7 +1098,7 @@ void NVMeSubsystem::sanitizeStatus(
+ bool completed, uint16_t sstat, uint16_t sprog,
+ uint32_t scdw10)>&& cb)
+ {
+- nvme_mi_ctrl_t ctrl = primaryController->getMiCtrl();
++ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ intf->adminGetLogPage(
+diff --git a/src/NVMeSubsys.hpp b/src/NVMeSubsys.hpp
+index 0b37cc0..6a8ebf2 100644
+--- a/src/NVMeSubsys.hpp
++++ b/src/NVMeSubsys.hpp
+@@ -127,9 +127,12 @@ class NVMeSubsystem :
+ createProgress;
+
+ // controller to use for NVMe operations. Is a member of the controllers
+- // map.
++ // map. Access this through getPrimaryController() to test for nullness.
+ std::shared_ptr<NVMeControllerEnabled> primaryController;
+
++ // may throw NVMeError if no controller is available
++ std::shared_ptr<NVMeControllerEnabled> getPrimaryController() const;
++
+ std::shared_ptr<sdbusplus::asio::dbus_interface> assocIntf;
+ void createAssociation();
+ void updateAssociation();
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0026-NVMeDrive-Use-constexpr-to-avoid-storage-and-relocat.patch b/recipes-phosphor/sensors/dbus-sensors/0026-NVMeDrive-Use-constexpr-to-avoid-storage-and-relocat.patch
new file mode 100644
index 0000000..3938019
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0026-NVMeDrive-Use-constexpr-to-avoid-storage-and-relocat.patch
@@ -0,0 +1,28 @@
+From a3facbccba9b39bb8238ce989707af4d05b8bb55 Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Wed, 27 Sep 2023 10:21:35 +0930
+Subject: [PATCH 2/7] NVMeDrive: Use constexpr to avoid storage and relocation
+ failure
+
+Change-Id: Ib16470129519e67510520ceb9db6d6221713c7b1
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeDrive.hpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/NVMeDrive.hpp b/src/NVMeDrive.hpp
+index 708a6e9..40e24eb 100644
+--- a/src/NVMeDrive.hpp
++++ b/src/NVMeDrive.hpp
+@@ -48,7 +48,7 @@ class NVMeDrive :
+ public std::enable_shared_from_this<NVMeDrive>
+ {
+ public:
+- static const int sanitizePollIntervalSecs = 4;
++ static constexpr const int sanitizePollIntervalSecs = 4;
+
+ NVMeDrive(boost::asio::io_context& io,
+ std::shared_ptr<sdbusplus::asio::connection> conn,
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0027-NVMeSubsys-Uphold-safety-properties-in-markFunctiona.patch b/recipes-phosphor/sensors/dbus-sensors/0027-NVMeSubsys-Uphold-safety-properties-in-markFunctiona.patch
new file mode 100644
index 0000000..1784926
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0027-NVMeSubsys-Uphold-safety-properties-in-markFunctiona.patch
@@ -0,0 +1,146 @@
+From 9a6d2ff67b1ee1d69d117d1768fc27f997e1c9cd Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Thu, 12 Oct 2023 14:29:50 +1030
+Subject: [PATCH 3/7] NVMeSubsys: Uphold safety properties in markFunctional()
+
+NVMeSubsystem::markFunction(true) invokes nvme_mi_ep_scan(..., true) via
+NVMeMiIntf::miScanCtrl(). A forced rescan of the endpoint invalidates
+all nvme_mi_ctrl objects maintained by the nvme_mi_ep instance.
+NVMeController instances held in NVMeSubsystem::controllers each hold a
+pointer to an associated nvme_mi_ctrl object, but in a non-owning
+relationship.
+
+Isolate the logic to ensure the subsystem is in Status::Stop prior to
+transitioning to Status::Initializing in the implementation of
+NVMeSubsystem::markFunctional() so it's clear we don't have consecutive
+calls to nvme_mi_ep_scan(..., true) without first destroying the
+NVMeController instances maintained in NVMeSubsystem::controllers. These
+are destroyed by a transition through Status::Stop via
+NVMeSubsys::markFunctional(false).
+
+An existing constraint is that it's invalid to invoke
+NVMeSubsystem::markFunctional(false) while the subsystem is in
+Status::Initializing. For specific error handling cases where
+NVMeSubsystem::markFunctional(false) had to be invoked the
+implementation open-coded the assignment of Status::Stop immediately
+prior.
+
+These assignments to NVMeSubsystem::status made it harder to
+reason about the properties Status::Stop was upholding: We rely
+Status::Stop to assert that NVMeController instances have been
+destructed prior to invalidating the nvme_mi_ctrl objects. To improve
+comprehension, introduce a Status::Aborting state to identify the period
+where the subsystem will be transitioned to Status::Stop but its
+NVMeController objects are still live.
+
+Change-Id: I98221b1c28dcdfaf9829985e8f7ef7d54222ac6b
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeSubsys.cpp | 39 ++++++++++++++++++++++++++++++---------
+ src/NVMeSubsys.hpp | 1 +
+ 2 files changed, 31 insertions(+), 9 deletions(-)
+
+diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp
+index 69d0d4f..3f289ee 100644
+--- a/src/NVMeSubsys.cpp
++++ b/src/NVMeSubsys.cpp
+@@ -132,7 +132,7 @@ void NVMeSubsystem::processSecondaryControllerList(nvme_secondary_ctrl_list* sec
+ {
+ std::cerr << "fail to match primary controller from "
+ "identify sencondary cntrl list" << std::endl;
+- status = Status::Stop;
++ status = Status::Aborting;
+ markFunctional(false);
+ markAvailable(false);
+ return;
+@@ -228,10 +228,35 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ // plugin.reset();
+ return;
+ }
++
+ if (status == Status::Intiatilzing)
+ {
+ throw std::runtime_error("cannot start: the subsystem is intiatilzing");
+ }
++
++ if (status == Status::Aborting)
++ {
++ throw std::runtime_error(
++ "cannot start: subsystem initialisation has aborted and must transition to stopped");
++ }
++
++ if (status == Status::Start)
++ {
++ // Prevent consecutive calls to NVMeMiIntf::miScanCtrl()
++ //
++ // NVMeMiIntf::miScanCtrl() calls nvme_mi_scan_ep(..., true), which
++ // forces a rescan and invalidates any nvme_mi_ctrl objects created on a
++ // previous scan.
++ //
++ // We require a transition through Status::Stop (via
++ // `markFunctional(false)`) so that the lifetime of the NVMeController
++ // instances in this->controllers do not exceed the lifetime of their
++ // associated nvme_mi_ctrl object.
++ return;
++ }
++
++ assert(status == Status::Stop);
++
+ status = Status::Intiatilzing;
+ markAvailable(toggle);
+
+@@ -249,7 +274,7 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ // TODO: mark the subsystem invalid and reschedule refresh
+ std::cerr << "fail to scan controllers for the nvme subsystem"
+ << (ec ? ": " + ec.message() : "") << std::endl;
+- self->status = Status::Stop;
++ self->status = Status::Aborting;
+ self->markFunctional(false);
+ self->markAvailable(false);
+ return;
+@@ -317,7 +342,7 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ {
+ std::cerr << "fail to identify secondary controller list"
+ << std::endl;
+- self->status = Status::Stop;
++ self->status = Status::Aborting;
+ self->markFunctional(false);
+ self->markAvailable(false);
+ // std::cerr << "fail to identify secondary controller list"
+@@ -344,7 +369,7 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ {
+ std::cerr << "empty identify secondary controller list"
+ << std::endl;
+- self->status = Status::Stop;
++ self->status = Status::Aborting;
+ self->markFunctional(false);
+ self->markAvailable(false);
+ return;
+@@ -571,11 +596,7 @@ void NVMeSubsystem::start()
+ return;
+ }
+
+- // restart the subsystem
+- if (self->status == Status::Stop)
+- {
+- self->markFunctional(true);
+- }
++ self->markFunctional(true);
+
+ // TODO: update the drive interface
+
+diff --git a/src/NVMeSubsys.hpp b/src/NVMeSubsys.hpp
+index 6a8ebf2..a975f0b 100644
+--- a/src/NVMeSubsys.hpp
++++ b/src/NVMeSubsys.hpp
+@@ -81,6 +81,7 @@ class NVMeSubsystem :
+ {
+ Stop,
+ Intiatilzing,
++ Aborting,
+ Start,
+ };
+
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0028-NVMeController-Expose-disable-to-prevent-queueing-jo.patch b/recipes-phosphor/sensors/dbus-sensors/0028-NVMeController-Expose-disable-to-prevent-queueing-jo.patch
new file mode 100644
index 0000000..0a479d2
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0028-NVMeController-Expose-disable-to-prevent-queueing-jo.patch
@@ -0,0 +1,190 @@
+From b0de52f5a433422d1938433c8b549d3a9793bb01 Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Wed, 25 Oct 2023 13:59:59 +1030
+Subject: [PATCH 4/7] NVMeController: Expose disable() to prevent queueing jobs
+ prior to destruction
+
+The lifecycle of an NVMeController is partially tied to related D-Bus interfaces
+where the method lambdas promote its weak pointer to shared. This interplays
+with e.g. the ctemp timer determining that the related endpoint has become
+inaccessible and the entire subsystem being marked as non-functional.
+
+A call through the NVMeController instance's D-Bus APIs will hold the D-Bus
+object available until the call completes, providing opportunity for further
+calls to occur even though the the object is only exposed as an existing call is
+in progress. These subsequent calls interfere with the effort to destroy the
+controller and rescan the endpoint. Rescanning the endpoint invalidates the
+nvme_mi_ctrl_t held by the NVMeController instance. We must prevent it from
+occurring prior to all associated NVMeController instances being destructed else
+the held nvme_mi_ctrl_t will point to an invalid object in memory.
+
+Expose NVMeController::disable() as a trapdoor state transition: Once
+NVMeController::disable() is invoked all calls through the query APIs will be
+rejected and no further jobs will be submitted to the associated NVMeMi Worker
+IO thread. The only exit from the disabled state is for the NVMeController to be
+destructed.
+
+Change-Id: If991588d5c17d021b49619e2bd610b65d88fca88
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeController.cpp | 61 +++++++++++++++++++++++++++++++++++++-----
+ src/NVMeController.hpp | 14 ++++++++++
+ 2 files changed, 69 insertions(+), 6 deletions(-)
+
+diff --git a/src/NVMeController.cpp b/src/NVMeController.cpp
+index d1dff13..336a5ed 100644
+--- a/src/NVMeController.cpp
++++ b/src/NVMeController.cpp
+@@ -180,6 +180,12 @@ sdbusplus::message::unix_fd NVMeControllerEnabled::getLogPage(uint8_t lid,
+ uint8_t lsp,
+ uint16_t lsi)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ std::array<int, 2> pipe;
+ if (::pipe(pipe.data()) < 0)
+ {
+@@ -264,6 +270,12 @@ sdbusplus::message::unix_fd NVMeControllerEnabled::getLogPage(uint8_t lid,
+ sdbusplus::message::unix_fd
+ NVMeControllerEnabled::identify(uint8_t cns, uint32_t nsid, uint16_t cntid)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ std::array<int, 2> pipe;
+ if (::pipe(pipe.data()) < 0)
+ {
+@@ -316,6 +328,13 @@ void NVMeControllerEnabled::firmwareCommitAsync(uint8_t commitAction,
+ {
+ throw sdbusplus::xyz::openbmc_project::Common::Error::NotAllowed();
+ }
++
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ this->NVMeAdmin::firmwareCommitStatus(FwCommitStatus::InProgress);
+ nvmeIntf->adminFwCommit(
+ nvmeCtrl, static_cast<nvme_fw_commit_ca>(commitAction & 0b111),
+@@ -432,6 +451,12 @@ void NVMeControllerEnabled::securitySendMethod(boost::asio::yield_context yield,
+ uint16_t proto_specific,
+ std::span<uint8_t> data)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ using callback_t = void(std::tuple<std::error_code, int>);
+ auto [err, nvme_status] =
+ boost::asio::async_initiate<boost::asio::yield_context, callback_t>(
+@@ -455,6 +480,12 @@ std::vector<uint8_t> NVMeControllerEnabled::securityReceiveMethod(
+ boost::asio::yield_context yield, uint8_t proto, uint16_t proto_specific,
+ uint32_t transfer_length)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ using callback_t =
+ void(std::tuple<std::error_code, int, std::vector<uint8_t>>);
+ auto [err, nvme_status, data] =
+@@ -530,6 +561,12 @@ void NVMeControllerEnabled::attachVolume(
+ boost::asio::yield_context yield,
+ const sdbusplus::message::object_path& volumePath)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ uint32_t nsid;
+ if (auto s = subsys.lock())
+ {
+@@ -563,17 +600,26 @@ void NVMeControllerEnabled::attachVolume(
+ // exception must be thrown outside of the async block
+ checkLibNVMeError(err, nvme_status, "attachVolume");
+
+- if (auto s = subsys.lock())
++ if (!disabled())
+ {
+- s->attachCtrlVolume(getCntrlId(), nsid);
++ if (auto s = subsys.lock())
++ {
++ s->attachCtrlVolume(getCntrlId(), nsid);
++ }
++ updateAssociation();
+ }
+- updateAssociation();
+ }
+
+ void NVMeControllerEnabled::detachVolume(
+ boost::asio::yield_context yield,
+ const sdbusplus::message::object_path& volumePath)
+ {
++ if (disabled())
++ {
++ std::cerr << "Controller has been disabled" << std::endl;
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ uint32_t nsid;
+ if (auto s = subsys.lock())
+ {
+@@ -607,9 +653,12 @@ void NVMeControllerEnabled::detachVolume(
+ // exception must be thrown outside of the async block
+ checkLibNVMeError(err, nvme_status, "detachVolume");
+
+- if (auto s = subsys.lock())
++ if (!disabled())
+ {
+- s->detachCtrlVolume(getCntrlId(), nsid);
++ if (auto s = subsys.lock())
++ {
++ s->detachCtrlVolume(getCntrlId(), nsid);
++ }
++ updateAssociation();
+ }
+- updateAssociation();
+ }
+diff --git a/src/NVMeController.hpp b/src/NVMeController.hpp
+index b901166..9274646 100644
+--- a/src/NVMeController.hpp
++++ b/src/NVMeController.hpp
+@@ -74,6 +74,20 @@ class NVMeController
+ std::max(sizeof(uint16_t), sizeof(void*))));
+ }
+
++ /*
++ * Disable is called to prevent issuing further queries against the
++ * controller
++ */
++ void disable()
++ {
++ nvmeCtrl = nullptr;
++ }
++
++ bool disabled() const
++ {
++ return nvmeCtrl == nullptr;
++ }
++
+ /**
+ * @brief Get the NVMe controller handle
+ */
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0029-NVMeProgress-Expose-abort-for-cancelling-progress-ob.patch b/recipes-phosphor/sensors/dbus-sensors/0029-NVMeProgress-Expose-abort-for-cancelling-progress-ob.patch
new file mode 100644
index 0000000..49ae6bf
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0029-NVMeProgress-Expose-abort-for-cancelling-progress-ob.patch
@@ -0,0 +1,51 @@
+From 3449cb2e29ca1962d39cc47d63b2d8d33c78eee5 Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Wed, 25 Oct 2023 14:22:37 +1030
+Subject: [PATCH 5/7] NVMeProgress: Expose abort() for cancelling progress
+ objects
+
+If progress is prevented by an event outside the context of an active query the
+progress object should signal an out-of-band failure to any consumers. An abort
+might occur, for example, as a result of the NVMe MI endpoint becoming
+unavailable prior to completing the task. Marking the progress object as aborted
+instead of removing it from the bus helps provide sensible semantics through the
+swordfish interface.
+
+Change-Id: Ie962199f66f231e2a10c114fc67a508e2adccf64
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeProgress.cpp | 5 +++++
+ src/NVMeProgress.hpp | 1 +
+ 2 files changed, 6 insertions(+)
+
+diff --git a/src/NVMeProgress.cpp b/src/NVMeProgress.cpp
+index 076cc93..8a18377 100644
+--- a/src/NVMeProgress.cpp
++++ b/src/NVMeProgress.cpp
+@@ -34,6 +34,11 @@ void NVMeProgress::fail()
+ status(OperationStatus::Failed);
+ }
+
++void NVMeProgress::abort()
++{
++ status(OperationStatus::Aborted);
++}
++
+ NVMeCreateVolumeProgress::NVMeCreateVolumeProgress(
+ std::shared_ptr<sdbusplus::asio::connection> conn,
+ const std::string& path) :
+diff --git a/src/NVMeProgress.hpp b/src/NVMeProgress.hpp
+index b472659..291ecb8 100644
+--- a/src/NVMeProgress.hpp
++++ b/src/NVMeProgress.hpp
+@@ -34,6 +34,7 @@ class NVMeProgress :
+
+ void complete();
+ void fail();
++ void abort();
+ };
+
+ class NVMeCreateVolumeProgress : public NVMeProgress
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0030-NVMeMiIntf-Expose-flushOperations-to-track-MI-IO-job.patch b/recipes-phosphor/sensors/dbus-sensors/0030-NVMeMiIntf-Expose-flushOperations-to-track-MI-IO-job.patch
new file mode 100644
index 0000000..fd95dec
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0030-NVMeMiIntf-Expose-flushOperations-to-track-MI-IO-job.patch
@@ -0,0 +1,76 @@
+From c4ff787652e4828c7378c0a08459939ebd62c4ba Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Wed, 25 Oct 2023 14:32:45 +1030
+Subject: [PATCH 6/7] NVMeMiIntf: Expose flushOperations() to track MI IO job
+ completion
+
+NVMeMi::flushOperations() invokes the provided callback back on the originating
+thread after all jobs prior to its submission have been completed. Its intended
+use is to flag when it is safe to rescan the MI endpoint via libnvme.
+
+Change-Id: If1c5623b93d0a8ac97b347a128f42332c5d2d30d
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeIntf.hpp | 2 ++
+ src/NVMeMi.cpp | 17 +++++++++++++++++
+ src/NVMeMi.hpp | 3 +++
+ 3 files changed, 22 insertions(+)
+
+diff --git a/src/NVMeIntf.hpp b/src/NVMeIntf.hpp
+index 7de7a33..073d7fb 100644
+--- a/src/NVMeIntf.hpp
++++ b/src/NVMeIntf.hpp
+@@ -175,6 +175,8 @@ class NVMeMiIntf
+
+ virtual ~NVMeMiIntf() = default;
+
++ virtual bool flushOperations(std::function<void()>&& cb) = 0;
++
+ virtual void adminIdentify(
+ nvme_mi_ctrl_t ctrl, nvme_identify_cns cns, uint32_t nsid,
+ uint16_t cntid,
+diff --git a/src/NVMeMi.cpp b/src/NVMeMi.cpp
+index a9b65f9..4cfba4b 100644
+--- a/src/NVMeMi.cpp
++++ b/src/NVMeMi.cpp
+@@ -673,6 +673,23 @@ void NVMeMi::miScanCtrl(std::function<void(const std::error_code&,
+ }
+ }
+
++bool NVMeMi::flushOperations(std::function<void()>&& cb)
++{
++ try
++ {
++ post([self{shared_from_this()}, cb{std::move(cb)}]() {
++ self->io.post(cb);
++ });
++
++ return true;
++ }
++ catch (const std::runtime_error& e)
++ {
++ std::cerr << "Runtime error: " << e.what() << std::endl;
++ return false;
++ }
++}
++
+ void NVMeMi::adminIdentify(
+ nvme_mi_ctrl_t ctrl, nvme_identify_cns cns, uint32_t nsid, uint16_t cntid,
+ std::function<void(nvme_ex_ptr, std::span<uint8_t>)>&& cb)
+diff --git a/src/NVMeMi.hpp b/src/NVMeMi.hpp
+index 0e5f50f..43e33cc 100644
+--- a/src/NVMeMi.hpp
++++ b/src/NVMeMi.hpp
+@@ -22,6 +22,9 @@ class NVMeMi : public NVMeMiIntf, public std::enable_shared_from_this<NVMeMi>
+ {
+ return eid;
+ }
++
++ bool flushOperations(std::function<void()>&& cb) override;
++
+ void miSubsystemHealthStatusPoll(
+ std::function<void(const std::error_code&,
+ nvme_mi_nvm_ss_health_status*)>&& cb) override;
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0031-NVMeSubsys-Clean-up-invalidated-controllers-volumes-.patch b/recipes-phosphor/sensors/dbus-sensors/0031-NVMeSubsys-Clean-up-invalidated-controllers-volumes-.patch
new file mode 100644
index 0000000..92f3252
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0031-NVMeSubsys-Clean-up-invalidated-controllers-volumes-.patch
@@ -0,0 +1,345 @@
+From b60a102a5143263fe8d1417d835898e9989d7cec Mon Sep 17 00:00:00 2001
+From: Andrew Jeffery <andrew@codeconstruct.com.au>
+Date: Wed, 25 Oct 2023 14:39:11 +1030
+Subject: [PATCH 7/7] NVMeSubsys: Clean-up invalidated controllers, volumes and
+ associations
+
+If the subsystem is marked as non-functional, for example due to the endpoint
+becoming unavailable, we must rescan the endpoint to re-establish its
+controllers, volumes and the associations. However, prior to rescanning we must
+ensure that it is safe to do so. Rescanning is not safe before all in-progress
+IO jobs have completed as the IO jobs hold an nvme_mi_ctrl_t, whose lifetime is
+tied to each scan period (rescanning invalidates the nvme_mi_ctrl_t).
+
+Introduce a Status::Terminating state that is entered upon invocation of
+`NVMeSubsystem::markFunctional(false)`. Status::Terminating indicates that the
+subsystem must eventually enter Status::Stop, but an unknown number of
+nvme_mi_ctrl object pointers remain live. We maintain the Status::Terminating
+state until all active IO jobs have completed, and we enter Status::Stop from
+Status::Terminating by flushing the NVMeMi worker queue with a job whose
+completion transitions the subsystem to the Status::Stop state.
+
+The fundamental invariant of the Status::Stop state is that no dangling
+references to nvme_mi_ctrl objects exist, asserting it safe to (re)scan the
+endpoint. To uphold it we must ensure that no further jobs can be enqueued in
+the IO thread subsequent to the flush job, in addition to the flush job
+completion signalling any existing queued jobs have been completed. We prevent
+this by requiring the subsystem be in Status::Start for any admin queries to be
+enqueued, and use NVMeController::disable() to prevent enqueuing of any
+controller queries.
+
+Having addressed the safety of any remaining nvme_mi_ctrl pointers, clear out
+the subsystem's own references to the controllers, volumes and their
+associations to drop the objects from D-Bus once all IO jobs holding shared
+pointers complete.
+
+Change-Id: I5315b53b375cf4fcf3bab7001d2a7055958fc552
+Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
+---
+ src/NVMeSubsys.cpp | 137 ++++++++++++++++++++++++++++++++++++++++++---
+ src/NVMeSubsys.hpp | 11 ++++
+ 2 files changed, 140 insertions(+), 8 deletions(-)
+
+diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp
+index 3f289ee..63d9422 100644
+--- a/src/NVMeSubsys.cpp
++++ b/src/NVMeSubsys.cpp
+@@ -6,6 +6,7 @@
+ #include "NVMePlugin.hpp"
+ #include "NVMeUtil.hpp"
+ #include "Thresholds.hpp"
++#include "xyz/openbmc_project/Common/error.hpp"
+
+ #include <charconv>
+ #include <filesystem>
+@@ -216,16 +217,60 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ throw std::runtime_error(
+ "cannot stop: the subsystem is intiatilzing");
+ }
+- status = Status::Stop;
++
++ if (status == Status::Terminating || status == Status::Stop)
++ {
++ return;
++ }
++
++ assert(status == Status::Start || status == Status::Aborting);
++
++ status = Status::Terminating;
+ if (plugin)
+ {
+ plugin->stop();
+ }
+ // TODO: the controller should be stopped after controller level polling
+ // is enabled
++ // Tell any progress objects we're aborting the operation
++ for (auto& [_, v] : createProgress)
++ {
++ v->abort();
++ }
++
++ // Tell the controllers they mustn't post further jobs
++ primaryController->disable();
++ for (auto& [_, v] : controllers)
++ {
++ v.first->disable();
++ }
+
++ // Avoid triggering C*V D-Bus updates by clearing internal state
++ // directly. The controller and volume objects and interfaces will be
++ // removed which will update the mapper.
++ attached.clear();
++ volumes.clear();
++ primaryController.reset();
+ controllers.clear();
+ // plugin.reset();
++
++ updateAssociation();
++
++ if (nvmeIntf.getProtocol() == NVMeIntf::Protocol::NVMeMI)
++ {
++ auto nvme =
++ std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
++
++ bool posted = nvme->flushOperations([self{shared_from_this()}]() mutable {
++ self->status = Status::Stop;
++ });
++
++ if (!posted)
++ {
++ std::cerr << "Failed to flush operations, subsystem has stalled!" << std::endl;
++ }
++ }
++
+ return;
+ }
+
+@@ -240,7 +285,7 @@ void NVMeSubsystem::markFunctional(bool toggle)
+ "cannot start: subsystem initialisation has aborted and must transition to stopped");
+ }
+
+- if (status == Status::Start)
++ if (status == Status::Start || status == Status::Terminating)
+ {
+ // Prevent consecutive calls to NVMeMiIntf::miScanCtrl()
+ //
+@@ -647,6 +692,11 @@ sdbusplus::message::object_path
+ NVMeSubsystem::createVolume(boost::asio::yield_context yield, uint64_t size,
+ size_t lbaFormat, bool metadataAtEnd)
+ {
++ if (status != Status::Start)
++ {
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
+ // Checks argument
+ if (lbaFormat >= supportedFormats.size())
+ {
+@@ -663,7 +713,10 @@ sdbusplus::message::object_path
+ std::make_shared<CommonErr::InvalidArgument>());
+ }
+
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ using callback_t = void(std::tuple<nvme_ex_ptr, NVMeNSIdentify>);
+@@ -719,6 +772,12 @@ void NVMeSubsystem::createVolumeFinished(std::string prog_id, nvme_ex_ptr ex,
+ }
+ auto prog = p->second;
+
++ if (prog->status() == OperationStatus::Aborted)
++ {
++ return;
++ }
++ assert(status == Status::Start);
++
+ if (ex)
+ {
+ prog->createFailure(ex);
+@@ -753,7 +812,16 @@ std::string NVMeSubsystem::volumePath(uint32_t nsid) const
+
+ void NVMeSubsystem::addIdentifyNamespace(uint32_t nsid)
+ {
++ if (status != Status::Start)
++ {
++ std::cerr << "Subsystem not in Start state, have " << static_cast<int>(status) << std::endl;
++ return;
++ }
++
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
+ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_ALLOCATED_NS,
+ nsid, NVME_CNTLID_NONE,
+@@ -815,7 +883,11 @@ void NVMeSubsystem::addIdentifyNamespace(uint32_t nsid)
+
+ void NVMeSubsystem::updateVolumes()
+ {
++ assert(status == Status::Start);
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
+ nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminListNamespaces(
+ ctrl, [ctrl, intf, self{shared_from_this()}](
+@@ -856,7 +928,11 @@ void NVMeSubsystem::updateVolumes()
+
+ void NVMeSubsystem::fillDrive()
+ {
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++ assert(status == Status::Start);
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_CTRL,
+ NVME_NSID_NONE, NVME_CNTLID_NONE,
+@@ -939,6 +1015,11 @@ std::vector<uint32_t> NVMeSubsystem::attachedVolumes(uint16_t ctrlId) const
+
+ void NVMeSubsystem::attachCtrlVolume(uint16_t c, uint32_t ns)
+ {
++ if (status != Status::Start)
++ {
++ std::cerr << "Subsystem is not in Start state: " << static_cast<int>(status) << std::endl;
++ return;
++ }
+ if (!controllers.contains(c))
+ {
+ std::cerr << "attachCtrlVolume bad controller " << c << std::endl;
+@@ -956,6 +1037,11 @@ void NVMeSubsystem::attachCtrlVolume(uint16_t c, uint32_t ns)
+
+ void NVMeSubsystem::detachCtrlVolume(uint16_t c, uint32_t ns)
+ {
++ if (status != Status::Start)
++ {
++ std::cerr << "Subsystem is not in Start state: " << static_cast<int>(status) << std::endl;
++ return;
++ }
+ if (!controllers.contains(c))
+ {
+ std::cerr << "detachCtrlVolume bad controller " << c << std::endl;
+@@ -973,6 +1059,11 @@ void NVMeSubsystem::detachCtrlVolume(uint16_t c, uint32_t ns)
+
+ void NVMeSubsystem::detachAllCtrlVolume(uint32_t ns)
+ {
++ if (status != Status::Start)
++ {
++ std::cerr << "Subsystem is not in Start state: " << static_cast<int>(status) << std::endl;
++ return;
++ }
+ if (!volumes.contains(ns))
+ {
+ std::cerr << "detachAllCtrlVolume bad ns " << ns << std::endl;
+@@ -991,6 +1082,8 @@ void NVMeSubsystem::detachAllCtrlVolume(uint32_t ns)
+ // Will throw a nvme_ex_ptr if the NS already exists */
+ std::shared_ptr<NVMeVolume> NVMeSubsystem::addVolume(const NVMeNSIdentify& ns)
+ {
++ assert(status == Status::Start);
++
+ if (volumes.contains(ns.namespaceId))
+ {
+ std::string err_msg = std::string("Internal error, NSID exists " +
+@@ -1033,8 +1126,11 @@ void NVMeSubsystem::forgetVolume(std::shared_ptr<NVMeVolume> volume)
+
+ void NVMeSubsystem::querySupportedFormats()
+ {
++ assert(status == Status::Start);
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
+
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+ intf->adminIdentify(
+ ctrl, nvme_identify_cns::NVME_IDENTIFY_CNS_NS, NVME_NSID_ALL,
+@@ -1078,7 +1174,15 @@ void NVMeSubsystem::querySupportedFormats()
+ void NVMeSubsystem::deleteVolume(boost::asio::yield_context yield,
+ std::shared_ptr<NVMeVolume> volume)
+ {
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++ if (status != Status::Start)
++ {
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ using callback_t = void(std::tuple<std::error_code, int>);
+@@ -1106,7 +1210,15 @@ void NVMeSubsystem::startSanitize(
+ const NVMeSanitizeParams& params,
+ std::function<void(nvme_ex_ptr ex)>&& submitCb)
+ {
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++ if (status != Status::Start)
++ {
++ throw sdbusplus::xyz::openbmc_project::Common::Error::Unavailable();
++ }
++
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ intf->adminSanitize(ctrl, params.nvmeAction(), params.passes,
+@@ -1119,7 +1231,16 @@ void NVMeSubsystem::sanitizeStatus(
+ bool completed, uint16_t sstat, uint16_t sprog,
+ uint32_t scdw10)>&& cb)
+ {
+- nvme_mi_ctrl_t ctrl = getPrimaryController()->getMiCtrl();
++ if (status != Status::Start)
++ {
++ std::cerr << "Subsystem not in Start state, have " << static_cast<int>(status) << std::endl;
++ return;
++ }
++
++ auto pc = getPrimaryController();
++ assert(!pc->disabled());
++ nvme_mi_ctrl_t ctrl = pc->getMiCtrl();
++
+ auto intf = std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ intf->adminGetLogPage(
+diff --git a/src/NVMeSubsys.hpp b/src/NVMeSubsys.hpp
+index a975f0b..c368274 100644
+--- a/src/NVMeSubsys.hpp
++++ b/src/NVMeSubsys.hpp
+@@ -77,12 +77,23 @@ class NVMeSubsystem :
+
+ NVMeIntf nvmeIntf;
+
++ /*
++ * stateDiagram
++ * [*] --> Stop
++ * Stop --> Intiatilzing
++ * Intiatilzing --> Start
++ * Intiatilzing --> Aborting
++ * Start --> Terminating
++ * Aborting --> Terminating
++ * Terminating --> Stop
++ */
+ enum class Status
+ {
+ Stop,
+ Intiatilzing,
+ Aborting,
+ Start,
++ Terminating,
+ };
+
+ Status status;
+--
+2.43.0.rc1.413.gea7ed67945-goog
+
diff --git a/recipes-phosphor/sensors/dbus-sensors_%.bbappend b/recipes-phosphor/sensors/dbus-sensors_%.bbappend
index eaa07d4..eca8ef3 100644
--- a/recipes-phosphor/sensors/dbus-sensors_%.bbappend
+++ b/recipes-phosphor/sensors/dbus-sensors_%.bbappend
@@ -92,6 +92,13 @@
file://0021-nvmesensor-use-processSecondaryControllerList-to-rep.patch \
file://0022-nvmesensor-refactor-createVolume-and-createNamespace.patch \
file://0023-nvmesensor-add-1s-timeout-for-mi-worker-condition_va.patch \
+ file://0025-nvmesensor-Add-getPrimaryController-helper.patch \
+ file://0026-NVMeDrive-Use-constexpr-to-avoid-storage-and-relocat.patch \
+ file://0027-NVMeSubsys-Uphold-safety-properties-in-markFunctiona.patch \
+ file://0028-NVMeController-Expose-disable-to-prevent-queueing-jo.patch \
+ file://0029-NVMeProgress-Expose-abort-for-cancelling-progress-ob.patch \
+ file://0030-NVMeMiIntf-Expose-flushOperations-to-track-MI-IO-job.patch \
+ file://0031-NVMeSubsys-Clean-up-invalidated-controllers-volumes-.patch \
"
PACKAGECONFIG[nvmesensor] = "-Dnvme=enabled, -Dnvme=disabled, libnvme"
SYSTEMD_SERVICE:${PN} += "${@bb.utils.contains('PACKAGECONFIG', 'nvmesensor', \