dbus-sensors: fix warm reboot for nvmesensor

The warm reboot feature has an issue on 12-SSD machines. That is because
the 3 ironfist FRUs were reported at different but very close timestamps
at boot time, which created entity-manager config-change jitter on dbus.
Tested:
b/290644786#comment5
Google-Bug-Id: 290644786
Change-Id: I85a127dff63f1f15d6240cd9cf48b9a323778cd0
Signed-off-by: Hao Jiang <jianghao@google.com>
(cherry picked from commit 3656c001410f21ff33579dc43dcb696a5256566b)
diff --git a/recipes-phosphor/sensors/dbus-sensors/0045-nvmesensor-improve-handling-of-config-change.patch b/recipes-phosphor/sensors/dbus-sensors/0045-nvmesensor-improve-handling-of-config-change.patch
new file mode 100644
index 0000000..d07c2c5
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0045-nvmesensor-improve-handling-of-config-change.patch
@@ -0,0 +1,112 @@
+From 0cb1ecae1582f24aca9104932996ef2e6156eb3b Mon Sep 17 00:00:00 2001
+From: Hao Jiang <jianghao@google.com>
+Date: Thu, 13 Jul 2023 19:51:03 +0000
+Subject: [PATCH 45/46] nvmesensor: improve handling of config change
+
+The improvement includes:
+1. Make sure there is only one `handleConfigurations` function running
+globally. That function is scheduled asynchronously to read the
+config profiles from dbus. Since the reading delay is uncertain and
+can be longer than the filter timer for property-change
+events, multiple `handleConfigurations` calls can be
+scheduled and stacked in the asio queue. In that case, we only need to
+handle the latest config profile reading instead of all of them.
+
+2. Move the subsystem clean-up process ahead of the config profile
+reading and enforce a minimal interval of 5 seconds between
+subsystem destruction and re-creation. That is because the lifetime of the
+nvme subsystem is held asynchronously by its internal polling loop. Stopping
+the subsystem only sends a signal to break the loop; the actual resource
+reclamation is not complete until all related asio context has been
+safely drained from the io queue. 5 seconds is a practical value based on
+a 1-second polling rate and communication delay < 1 s.
+
+Signed-off-by: Hao Jiang <jianghao@google.com>
+Change-Id: I5d029cef792806f0f2861ad03125acd63b6dc3c7
+---
+ src/NVMeSensorMain.cpp | 57 ++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 46 insertions(+), 11 deletions(-)
+
+diff --git a/src/NVMeSensorMain.cpp b/src/NVMeSensorMain.cpp
+index d6612f6..c380c7a 100644
+--- a/src/NVMeSensorMain.cpp
++++ b/src/NVMeSensorMain.cpp
+@@ -124,15 +124,6 @@ static void handleConfigurations(
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+ const ManagedObjectType& nvmeConfigurations)
+ {
+- // todo: it'd be better to only update the ones we care about
+- for (const auto& [_, nvmeSubsys] : nvmeSubsysMap)
+- {
+- if (nvmeSubsys)
+- {
+- nvmeSubsys->stop();
+- }
+- }
+- nvmeSubsysMap.clear();
+
+ /* We perform two iterations for configurations here. The first iteration is
+ * to set up NVMeIntf. The second iter is to setup NVMe subsystem.
+@@ -264,13 +255,57 @@ void createNVMeSubsystems(
+ boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection)
+ {
++ // todo: it'd be better to only update the ones we care about
++ for (const auto& [_, nvmeSubsys] : nvmeSubsysMap)
++ {
++ if (nvmeSubsys)
++ {
++ nvmeSubsys->stop();
++ }
++ }
++ nvmeSubsysMap.clear();
++
++ static int count = 0;
++ static ManagedObjectType configs;
++ count += 2;
+
+ auto getter = std::make_shared<GetSensorConfiguration>(
+ dbusConnection, [&io, &objectServer, &dbusConnection](
+ const ManagedObjectType& nvmeConfigurations) {
+- handleConfigurations(io, objectServer, dbusConnection,
+- nvmeConfigurations);
++ configs = nvmeConfigurations;
++ count--;
++ if (count == 0)
++ {
++ handleConfigurations(io, objectServer, dbusConnection, configs);
++ }
++ else
++ {
++            std::cerr << "more than one `handleConfigurations` has been "
++                         "scheduled, cancelling the current one"
++                      << std::endl;
++ }
+ });
++ auto timer = std::make_shared<boost::asio::steady_timer>(
++ io, std::chrono::seconds(5));
++ timer->async_wait([&io, &objectServer, &dbusConnection,
++ timer](const boost::system::error_code& ec) {
++ count--;
++ if (ec)
++ {
++ return;
++ }
++ if (count == 0)
++ {
++ handleConfigurations(io, objectServer, dbusConnection, configs);
++ }
++ else
++ {
++            std::cerr << "`handleConfigurations` has not been triggered, "
++                         "cancelling the timer"
++                      << std::endl;
++ }
++ });
++
+ getter->getConfiguration(
+ std::vector<std::string>{NVMeSubsystem::sensorType});
+ }
+--
+2.34.1
+
diff --git a/recipes-phosphor/sensors/dbus-sensors/0046-nvmesensor-release-resources-when-exit-poll.patch b/recipes-phosphor/sensors/dbus-sensors/0046-nvmesensor-release-resources-when-exit-poll.patch
new file mode 100644
index 0000000..19ac780
--- /dev/null
+++ b/recipes-phosphor/sensors/dbus-sensors/0046-nvmesensor-release-resources-when-exit-poll.patch
@@ -0,0 +1,208 @@
+From 49d9b4e938bfc1410668262201b0d2f7f0d267ce Mon Sep 17 00:00:00 2001
+From: Hao Jiang <jianghao@google.com>
+Date: Thu, 13 Jul 2023 20:47:29 +0000
+Subject: [PATCH 46/46] nvmesensor: release resources when exit poll
+
+When stop is issued to the poll loop, the loop should break correctly and
+release all resources it holds. Currently it has the following issues:
+
+* The loop held a shared_ptr to the timer, so in some cases its
+  lifetime was held by the loop itself. Now the loop only holds a
+  weak_ptr and breaks when the timer has expired.
+
+* After cancelling the poll timer, tasks that 1. have been scheduled in
+  the io queue or 2. have been scheduled in the timer invocation queue
+  will still be invoked. Added an extra check of the timer so: 1. the
+  data_fetcher will not be scheduled after timer expiration; 2. the
+  data_processor will receive an operation_canceled error. This makes
+  sure a data_fetcher always invokes its data_processor.
+
+* Implicitly, the health poll will not update the status for a
+  cancelled operation after stop(). This prevents the last status
+  update from re-enabling the subsystem after stop().
+* The plugin is now correctly created during start and destroyed
+  during stop.
+
+Signed-off-by: Hao Jiang <jianghao@google.com>
+Change-Id: I6b9f9646c0993849310cfa4059d70361c5907d98
+---
+ src/NVMeSubsys.cpp | 55 ++++++++++++++++++++++++++++++++--------------
+ src/NVMeUtil.hpp | 19 +++++++++++-----
+ 2 files changed, 53 insertions(+), 21 deletions(-)
+
+diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp
+index 63e3426..54f0da8 100644
+--- a/src/NVMeSubsys.cpp
++++ b/src/NVMeSubsys.cpp
+@@ -341,7 +341,8 @@ void NVMeSubsystem::start()
+ auto intf =
+ std::get<std::shared_ptr<NVMeBasicIntf>>(nvmeIntf.getInferface());
+ ctemp_fetch_t<NVMeBasicIntf::DriveStatus*> dataFether =
+- [intf, self{std::move(shared_from_this())}](
++ [intf, self{std::move(shared_from_this())},
++ timer = std::weak_ptr<boost::asio::steady_timer>(ctempTimer)](
+ std::function<void(const std::error_code&,
+ NVMeBasicIntf::DriveStatus*)>&& cb) {
+ /* Potentially defer sampling the sensor sensor if it is in error */
+@@ -355,8 +356,10 @@ void NVMeSubsystem::start()
+ intf->getStatus(std::move(cb));
+ };
+ ctemp_process_t<NVMeBasicIntf::DriveStatus*> dataProcessor =
+- [self{shared_from_this()}](const std::error_code& error,
+- NVMeBasicIntf::DriveStatus* status) {
++ [self{shared_from_this()},
++ timer = std::weak_ptr<boost::asio::steady_timer>(ctempTimer)](
++ const std::error_code& error,
++ NVMeBasicIntf::DriveStatus* status) {
+ // deferred sampling
+ if (error == std::errc::operation_canceled)
+ {
+@@ -408,7 +411,8 @@ void NVMeSubsystem::start()
+ std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
+
+ ctemp_fetch_t<nvme_mi_nvm_ss_health_status*> dataFether =
+- [intf, self{std::move(shared_from_this())}](
++ [intf, self{std::move(shared_from_this())},
++ timer = std::weak_ptr<boost::asio::steady_timer>(ctempTimer)](
+ std::function<void(const std::error_code&,
+ nvme_mi_nvm_ss_health_status*)>&& cb) {
+ // do not poll the health status if subsystem is in cooldown
+@@ -431,16 +435,24 @@ void NVMeSubsystem::start()
+ intf->miSubsystemHealthStatusPoll(std::move(cb));
+ };
+ ctemp_process_t<nvme_mi_nvm_ss_health_status*> dataProcessor =
+- [self{shared_from_this()}](const std::error_code& error,
+- nvme_mi_nvm_ss_health_status* status) {
++ [self{shared_from_this()},
++ timer = std::weak_ptr<boost::asio::steady_timer>(ctempTimer)](
++ const std::error_code& error,
++ nvme_mi_nvm_ss_health_status* status) {
+ if (self->UnavailableCount > 0)
+ {
+ self->UnavailableCount--;
+ return;
+ }
+
+- if (error == std::errc::operation_canceled ||
+- self->status == Status::Intiatilzing)
++ if (error == std::errc::operation_canceled)
++ {
++ std::cerr << "processing health data has been cancelled"
++ << std::endl;
++ return;
++ }
++
++ if (self->status == Status::Intiatilzing)
+ {
+ // on initialization, the subsystem will not update the status.
+ std::cerr
+@@ -501,18 +513,19 @@ void NVMeSubsystem::start()
+ }
+ void NVMeSubsystem::stop()
+ {
+- ctempTimer->cancel();
+- if (status == Status::Start)
++ if (ctempTimer)
+ {
+- std::cerr << "status start" << std::endl;
+- markFunctional(false);
++ ctempTimer->cancel();
++ ctempTimer.reset();
+ }
+- else if (status == Status::Intiatilzing)
++
++ if (status == Status::Intiatilzing)
+ {
+ std::cerr << "status init" << std::endl;
+- ctempTimer->expires_after(std::chrono::milliseconds(100));
+- ctempTimer->async_wait(
+- [self{shared_from_this()}](boost::system::error_code ec) {
++ auto timer = std::make_shared<boost::asio::steady_timer>(
++ io, std::chrono::milliseconds(100));
++ timer->async_wait(
++ [self{shared_from_this()}, timer](boost::system::error_code ec) {
+ if (ec)
+ {
+ return;
+@@ -520,4 +533,14 @@ void NVMeSubsystem::stop()
+ self->stop();
+ });
+ }
++ else
++ {
++ std::cerr << "status else" << std::endl;
++ markFunctional(false);
++ }
++
++ if (plugin)
++ {
++ plugin.reset();
++ }
+ }
+diff --git a/src/NVMeUtil.hpp b/src/NVMeUtil.hpp
+index 8639113..846917c 100644
+--- a/src/NVMeUtil.hpp
++++ b/src/NVMeUtil.hpp
+@@ -116,7 +116,7 @@ using ctemp_process_t =
+
+ template <class T>
+ void pollCtemp(
+- std::shared_ptr<boost::asio::steady_timer> timer,
++ std::weak_ptr<boost::asio::steady_timer> timer,
+ std::chrono::duration<double, std::milli> delay,
+ const std::function<void(std::function<void(const std::error_code&, T)>&&)>&
+ dataFetcher,
+@@ -127,23 +127,31 @@ namespace detail
+ {
+
+ template <class T>
+-void updateCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
++void updateCtemp(std::weak_ptr<boost::asio::steady_timer> timer,
+ std::chrono::duration<double, std::milli> delay,
+ ctemp_process_t<T> dataProcessor, ctemp_fetch_t<T> dataFetcher,
+ const std::error_code& error, T data)
+ {
++ if (timer.expired())
++ {
++ dataProcessor(std::make_error_code(std::errc::operation_canceled),
++ data);
++ return;
++ }
++
+ dataProcessor(error, data);
+ ::pollCtemp(std::move(timer), std::move(delay), dataFetcher, dataProcessor);
+ }
+
+ template <class T>
+-void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
++void pollCtemp(std::weak_ptr<boost::asio::steady_timer> timer,
+ std::chrono::duration<double, std::milli> delay,
+ ctemp_fetch_t<T> dataFetcher, ctemp_process_t<T> dataProcessor,
+ const boost::system::error_code errorCode)
+ {
+- if (errorCode == boost::asio::error::operation_aborted)
++ if (errorCode == boost::asio::error::operation_aborted || timer.expired())
+ {
++ std::cerr << "poll loop has been cancelled" << std::endl;
+ return;
+ }
+ if (errorCode)
+@@ -162,13 +170,14 @@ void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
+
+ template <class T>
+ void pollCtemp(
+- std::shared_ptr<boost::asio::steady_timer> timer,
++ std::weak_ptr<boost::asio::steady_timer> weakTimer,
+ std::chrono::duration<double, std::milli> delay,
+ const std::function<void(std::function<void(const std::error_code&, T)>&&)>&
+ dataFetcher,
+ const std::function<void(const std::error_code& error, T data)>&
+ dataProcessor)
+ {
++ auto timer = weakTimer.lock();
+ if (!timer)
+ {
+ return;
+--
+2.34.1
+
diff --git a/recipes-phosphor/sensors/dbus-sensors_%.bbappend b/recipes-phosphor/sensors/dbus-sensors_%.bbappend
index 228f58f..fa966d0 100644
--- a/recipes-phosphor/sensors/dbus-sensors_%.bbappend
+++ b/recipes-phosphor/sensors/dbus-sensors_%.bbappend
@@ -57,6 +57,8 @@
file://0042-nvmesensor-chuck-fetch-for-telemetry-logs.patch \
file://0043-nvmesensor-fix-plugin-issue.patch \
file://0044-nvmesensor-add-markAvailable-with-adaptive-timer.patch \
+ file://0045-nvmesensor-improve-handling-of-config-change.patch \
+ file://0046-nvmesensor-release-resources-when-exit-poll.patch \
"
PACKAGECONFIG[nvmesensor] = "-Dnvme=enabled, -Dnvme=disabled, libnvme libnvme-vu"
SYSTEMD_SERVICE:${PN} += "${@bb.utils.contains('PACKAGECONFIG', 'nvmesensor', \