NVMeMI: Cancel the Timer event if waiting to be scheduled Previously, if the MI layer took time to come up, the optimization loop would keep on running. In the meantime, if a Power_stop signal was received, it would reset the nvmeEP, resulting in a segmentation fault in the Timer callback. Tested=Manually with a loop of power cycles * Stressing with the NVMe-MI layer working * Injecting errors so the MI layer always errors out Google-Bug-Id: 372696551 Change-Id: Ic2c593481b5bea0ba8d00e5544b6bcdc6bdaa124 Signed-off-by: Muhammad Usama <muhammadusama@google.com>

commit: 8f5c409822967f9145ad1b8063326c44019d6a01 [log] [tgz]
author: Muhammad Usama <muhammadusama@google.com> Thu Oct 17 02:37:06 2024 +0000
committer: Muhammad Usama Chaudhry <muhammadusama@google.com> Thu Oct 17 18:36:44 2024 +0000
tree: 7f63ac7f6e344f69ed44f555b3e16956938dac14
parent: c989b773eaa3c27908440b9f2812edfddd5e83d3 [diff]
diff --git a/src/NVMeMi.cpp b/src/NVMeMi.cpp
index 2aff3df..775e6fd 100644
--- a/src/NVMeMi.cpp
+++ b/src/NVMeMi.cpp

@@ -27,7 +27,7 @@
                int addr, bool singleThreadMode, PowerState readState) :
     io(io), conn(conn), dbus(*conn.get()), bus(bus), addr(addr),
     readState(readState), nvmeEP(nullptr), nid(-1), eid(0), mtu(64),
-    startLoopRunning(false), mctpStatus(Status::Reset)
+    optimizeTimer(nullptr), mctpStatus(Status::Reset)
 {
     // set update the worker thread
     if (!nvmeRoot)
@@ -82,7 +82,7 @@
 void NVMeMi::start()
 {
     // Already in start loop
-    if (startLoopRunning)
+    if (optimizeTimer)
     {
         return;
     }
@@ -161,20 +161,18 @@
 
     if (mctpStatus == Status::Initiated)
     {
-        startLoopRunning = true;
-        auto timer = std::make_shared<boost::asio::steady_timer>(
+        optimizeTimer = std::make_shared<boost::asio::steady_timer>(
             io, std::chrono::milliseconds(500));
-        timer->async_wait([this, timer](boost::system::error_code ec) {
+        optimizeTimer->async_wait([this](boost::system::error_code ec) {
             if (ec)
             {
-                startLoopRunning = false;
+                std::cerr << "Endpoint optimize timer error " << ec
+                          << std::endl;
                 return;
             }
-            unsigned timeout = nvme_mi_ep_get_timeout(nvmeEP);
-            nvme_mi_ep_set_timeout(nvmeEP, initCmdTimeoutMS);
             miSetMCTPConfiguration(
-                [timeout, self{shared_from_this()}](const std::error_code& ec) {
-                nvme_mi_ep_set_timeout(self->nvmeEP, timeout);
+                [self{shared_from_this()}](const std::error_code& ec) {
+                self->optimizeTimer = nullptr;
                 if (ec)
                 {
                     std::cerr << "[bus: " << self->bus
@@ -183,18 +181,15 @@
                               << std::to_string(self->nid) + ":" +
                                      std::to_string(self->eid)
                               << std::endl;
-                    self->startLoopRunning = false;
                     self->start();
                     return;
                 }
                 auto rc = self->configureLocalRouteMtu();
                 if (rc)
                 {
-                    self->startLoopRunning = false;
                     self->start();
                     return;
                 }
-                self->startLoopRunning = false;
                 self->mctpStatus = Status::Connected;
             });
         });
@@ -207,6 +202,15 @@
     {
         return;
     }
+
+    if (optimizeTimer)
+    {
+        std::cerr << "[ bus: " << bus << ", addr: " << addr << "]"
+                  << "Cancel the optimization Timer for the endpoint"
+                  << std::endl;
+        optimizeTimer->cancel();
+        optimizeTimer = nullptr;
+    }
     // each nvme mi message transaction should take relatively short time
     // (typically <= 200 ms). So the blocking time should be short
     std::unique_lock<std::mutex> lock(mctpMtx);
@@ -348,8 +352,11 @@
         post([port_id, max_supported_freq, self{shared_from_this()},
               cb{std::move(cb)}]() mutable {
             enum nvme_mi_config_smbus_freq smbusFreq;
+            unsigned timeout = nvme_mi_ep_get_timeout(self->nvmeEP);
+            nvme_mi_ep_set_timeout(self->nvmeEP, initCmdTimeoutMS);
             auto rc = nvme_mi_mi_config_get_smbus_freq(self->nvmeEP, port_id,
                                                        &smbusFreq);
+            nvme_mi_ep_set_timeout(self->nvmeEP, timeout);
             if (rc)
             {
                 std::cerr << "[bus: " << self->bus << ", addr: " << self->addr
@@ -409,7 +416,10 @@
     {
         post([port, mtu, max_supported_freq, self{shared_from_this()},
               cb{std::move(cb)}]() mutable {
+            unsigned timeout = nvme_mi_ep_get_timeout(self->nvmeEP);
+            nvme_mi_ep_set_timeout(self->nvmeEP, initCmdTimeoutMS);
             auto rc = nvme_mi_mi_config_set_mctp_mtu(self->nvmeEP, port, mtu);
+            nvme_mi_ep_set_timeout(self->nvmeEP, timeout);
             if (rc)
             {
                 std::cerr << "[bus: " << self->bus << ", addr: " << self->addr
@@ -462,8 +472,11 @@
     try
     {
         post([cb{std::move(cb)}, self{shared_from_this()}]() mutable {
+            unsigned timeout = nvme_mi_ep_get_timeout(self->nvmeEP);
+            nvme_mi_ep_set_timeout(self->nvmeEP, initCmdTimeoutMS);
             struct nvme_mi_read_nvm_ss_info ssInfo;
             auto rc = nvme_mi_mi_read_mi_data_subsys(self->nvmeEP, &ssInfo);
+            nvme_mi_ep_set_timeout(self->nvmeEP, timeout);
             if (rc)
             {
                 std::cerr << "Failed reading subsystem info failing: "
@@ -473,7 +486,6 @@
                 });
                 return;
             }
-
             for (uint8_t port_id = 0; port_id <= ssInfo.nump; port_id++)
             {
                 struct nvme_mi_read_port_info portInfo;

diff --git a/src/NVMeMi.hpp b/src/NVMeMi.hpp
index d5c41a8..6dca9e0 100644
--- a/src/NVMeMi.hpp
+++ b/src/NVMeMi.hpp

@@ -160,7 +160,7 @@
         Connected,
     };
 
-    bool startLoopRunning;
+    std::shared_ptr<boost::asio::steady_timer> optimizeTimer;
 
     Status mctpStatus;
commit	8f5c409822967f9145ad1b8063326c44019d6a01	[log] [tgz]
author	Muhammad Usama <muhammadusama@google.com>	Thu Oct 17 02:37:06 2024 +0000
committer	Muhammad Usama Chaudhry <muhammadusama@google.com>	Thu Oct 17 18:36:44 2024 +0000
tree	7f63ac7f6e344f69ed44f555b3e16956938dac14
parent	c989b773eaa3c27908440b9f2812edfddd5e83d3 [diff]