blob: 7325df937b9afd8b3ac08ac4b43cc0d088953ebd [file] [log] [blame] [edit]
From f89349b1a54b3e5bb6dcb3ddd027f90543c9a46e Mon Sep 17 00:00:00 2001
From: Hao Jiang <jianghao@google.com>
Date: Thu, 20 Apr 2023 20:48:54 +0000
Subject: [PATCH 32/44] nvmesensor: subsystem poll supports absence
Enhance the polling function to support more symptoms. For example, the
endpoint absence. The signal comes from NVMeMi class because the
absence detection was done by transporation layer(mctp or i2c binding),
instead of the NVMe-MI protocol layer.
The enhancement will be furthur utilized by other symptoms provided by
NSHDS (NVM Subsystem Health Data Structure), including but not limited
to SW, CCS.CSTS, CCS.NAC, etc.
Tested: passed the unit test of TestDriveAbsent with increase polling
rate of 50 ms.
Signed-off-by: Hao Jiang <jianghao@google.com>
Change-Id: I5efe7c07e07630064b5e6e7fd707f3c286673bba
---
src/NVMeBasic.cpp | 13 +---
src/NVMeIntf.hpp | 1 +
src/NVMeSubsys.cpp | 146 +++++++++++++++++++++++++++++++++--------
src/NVMeSubsys.hpp | 1 -
src/NVMeUtil.hpp | 92 ++++++++------------------
tests/test_nvme_mi.cpp | 95 ++++++++++++++++++++++++++-
6 files changed, 242 insertions(+), 106 deletions(-)
diff --git a/src/NVMeBasic.cpp b/src/NVMeBasic.cpp
index 3a2d4ca..d18aed4 100644
--- a/src/NVMeBasic.cpp
+++ b/src/NVMeBasic.cpp
@@ -342,23 +342,12 @@ void NVMeBasic::getStatus(
/* Process the callback */
self->io.post([data, cb{std::move(cb)}]() {
- if (data->size() <
- sizeof(DriveStatus) + 1) // The first byte is status flags
+ if (data->size() < sizeof(DriveStatus))
{
cb(std::make_error_code(std::errc::message_size), nullptr);
return;
}
- uint8_t flags = static_cast<uint8_t>(data->front());
- if (((flags & NVME_MI_BASIC_SFLGS_DRIVE_NOT_READY) != 0) ||
- ((flags & NVME_MI_BASIC_SFLGS_DRIVE_FUNCTIONAL) == 0))
- {
- cb(std::make_error_code(std::errc::no_such_device),
- nullptr);
- return;
- }
-
- data->erase(data->begin());
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
cb({}, reinterpret_cast<DriveStatus*>(data->data()));
});
diff --git a/src/NVMeIntf.hpp b/src/NVMeIntf.hpp
index e52a700..f87b536 100644
--- a/src/NVMeIntf.hpp
+++ b/src/NVMeIntf.hpp
@@ -78,6 +78,7 @@ class NVMeBasicIntf
public:
struct DriveStatus
{
+ uint8_t Status;
uint8_t SmartWarnings;
uint8_t Temp;
uint8_t DriveLifeUsed;
diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp
index 10b360d..c605e30 100644
--- a/src/NVMeSubsys.cpp
+++ b/src/NVMeSubsys.cpp
@@ -1,6 +1,7 @@
#include "NVMeSubsys.hpp"
#include "NVMePlugin.hpp"
+#include "NVMeUtil.hpp"
#include "Thresholds.hpp"
#include <filesystem>
@@ -303,60 +304,153 @@ void NVMeSubsystem::start()
{
auto intf =
std::get<std::shared_ptr<NVMeBasicIntf>>(nvmeIntf.getInferface());
- ctemp_fetcher_t<NVMeBasicIntf::DriveStatus*> dataFether =
- [intf](std::function<void(const std::error_code&,
- NVMeBasicIntf::DriveStatus*)>&& cb) {
+ ctemp_fetch_t<NVMeBasicIntf::DriveStatus*> dataFether =
+ [intf, self{std::move(shared_from_this())}](
+ std::function<void(const std::error_code&,
+ NVMeBasicIntf::DriveStatus*)>&& cb) {
+ /* Potentially defer sampling the sensor sensor if it is in error */
+ if (!self->ctemp->sample())
+ {
+ cb(std::make_error_code(std::errc::operation_canceled),
+ nullptr);
+ return;
+ }
+
intf->getStatus(std::move(cb));
};
- ctemp_parser_t<NVMeBasicIntf::DriveStatus*> dataParser =
- [](NVMeBasicIntf::DriveStatus* status) -> std::optional<double> {
+ ctemp_process_t<NVMeBasicIntf::DriveStatus*> dataProcessor =
+ [self{shared_from_this()}](const std::error_code& error,
+ NVMeBasicIntf::DriveStatus* status) {
+ // deferred sampling
+ if (error == std::errc::operation_canceled)
+ {
+ return;
+ }
+ // The device is physically absent
+ else if (error == std::errc::no_such_device)
+ {
+ std::cerr << "error reading ctemp from subsystem"
+ << ", reason:" << error.message() << "\n";
+ self->ctemp->markFunctional(false);
+ self->ctemp->markAvailable(false);
+ return;
+ }
+ // other communication errors
+ else if (error)
+ {
+ std::cerr << "error reading ctemp from subsystem"
+ << ", reason:" << error.message() << "\n";
+ self->ctemp->incrementError();
+ return;
+ }
+
if (status == nullptr)
{
- return std::nullopt;
+ std::cerr << "empty data returned by data fetcher" << std::endl;
+ self->ctemp->markFunctional(false);
+ return;
}
- return {getTemperatureReading(status->Temp)};
+
+ uint8_t flags = status->Status;
+ if (((flags & NVMeBasicIntf::StatusFlags::
+ NVME_MI_BASIC_SFLGS_DRIVE_NOT_READY) != 0) ||
+ ((flags & NVMeBasicIntf::StatusFlags::
+ NVME_MI_BASIC_SFLGS_DRIVE_FUNCTIONAL) == 0))
+ {
+ self->ctemp->markFunctional(false);
+ return;
+ }
+ self->ctemp->updateValue(getTemperatureReading(status->Temp));
};
- pollCtemp(this->ctempTimer, this->ctemp, dataFether, dataParser);
+
+ pollCtemp(ctempTimer, std::chrono::seconds(1), dataFether,
+ dataProcessor);
}
else if (nvmeIntf.getProtocol() == NVMeIntf::Protocol::NVMeMI)
{
auto intf =
std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface());
- ctemp_fetcher_t<nvme_mi_nvm_ss_health_status*> dataFether =
- [intf](std::function<void(const std::error_code&,
- nvme_mi_nvm_ss_health_status*)>&& cb) {
+ ctemp_fetch_t<nvme_mi_nvm_ss_health_status*> dataFether =
+ [intf, self{std::move(shared_from_this())}](
+ std::function<void(const std::error_code&,
+ nvme_mi_nvm_ss_health_status*)>&& cb) {
+ // do not poll the health status if the subsystem is intiatilzing
+ if (self->status == Status::Intiatilzing)
+ {
+ std::cerr << "subsystem is intiatilzing, cancel the health poll"
+ << std::endl;
+ cb(std::make_error_code(std::errc::operation_canceled),
+ nullptr);
+ return;
+ }
intf->miSubsystemHealthStatusPoll(std::move(cb));
};
- ctemp_parser_t<nvme_mi_nvm_ss_health_status*> dataParser =
- [self{shared_from_this()}](
- nvme_mi_nvm_ss_health_status* status) -> std::optional<double> {
- // Drive Functional
- bool df = status->nss & 0x20;
- if (!df)
+ ctemp_process_t<nvme_mi_nvm_ss_health_status*> dataProcessor =
+ [self{shared_from_this()}](const std::error_code& error,
+ nvme_mi_nvm_ss_health_status* status) {
+ if (error == std::errc::operation_canceled ||
+ self->status == Status::Intiatilzing)
{
+ // on initialization, the subsystem will not update the status.
+ std::cerr
+ << "subsystem is intiatilzing, do not process the status"
+ << std::endl;
+ return;
+ }
+
+ if (error == std::errc::no_such_device)
+ {
+ std::cerr << "error reading ctemp "
+ "from subsystem"
+ << ", reason:" << error.message() << "\n";
// stop the subsystem
- try
+ self->markFunctional(false);
+ self->ctemp->markFunctional(false);
+ self->ctemp->markAvailable(false);
+
+ return;
+ }
+ else if (error)
+ {
+ std::cerr << "error reading ctemp "
+ "from subsystem"
+ << ", reason:" << error.message() << "\n";
+ self->ctemp->incrementError();
+ if (self->ctemp->inError())
{
self->markFunctional(false);
self->ctemp->markFunctional(false);
}
- catch (const std::runtime_error& e)
- {
- std::cerr << e.what() << std::endl;
- }
- return std::nullopt;
+ return;
+ }
+
+ // Drive Functional
+ bool df = status->nss & 0x20;
+ if (!df)
+ {
+ // stop the subsystem
+
+ self->markFunctional(false);
+ self->ctemp->markFunctional(false);
+
+ return;
}
// restart the subsystem
if (self->status == Status::Stop)
{
self->markFunctional(true);
- self->ctemp->markFunctional(true);
}
- return {getTemperatureReading(status->ctemp)};
+
+ // TODO: update the drive interface
+
+ self->ctemp->updateValue(getTemperatureReading(status->ctemp));
+ return;
};
- pollCtemp(ctempTimer, ctemp, dataFether, dataParser);
+
+ pollCtemp(ctempTimer, std::chrono::seconds(1), dataFether,
+ dataProcessor);
}
}
void NVMeSubsystem::stop()
diff --git a/src/NVMeSubsys.hpp b/src/NVMeSubsys.hpp
index e5db58a..6bd8b62 100644
--- a/src/NVMeSubsys.hpp
+++ b/src/NVMeSubsys.hpp
@@ -4,7 +4,6 @@
#include "NVMeDrive.hpp"
#include "NVMeSensor.hpp"
#include "NVMeStorage.hpp"
-#include "NVMeUtil.hpp"
#include "Utils.hpp"
class NVMeControllerPlugin;
diff --git a/src/NVMeUtil.hpp b/src/NVMeUtil.hpp
index 16fe291..8639113 100644
--- a/src/NVMeUtil.hpp
+++ b/src/NVMeUtil.hpp
@@ -7,6 +7,7 @@
#include <filesystem>
#include <iostream>
#include <optional>
+#include <system_error>
inline std::filesystem::path deriveRootBusPath(int busNumber)
{
@@ -104,60 +105,43 @@ inline std::optional<std::string>
// Function type for fetching ctemp which encaplucated in a structure of T.
// The fetcher function take a callback as input to process the result.
template <class T>
-using ctemp_fetcher_t =
+using ctemp_fetch_t =
std::function<void(std::function<void(const std::error_code&, T)>&&)>;
-// Function type for parsing ctemp out the structure of type T.
-// The parser function will return the value of ctemp or nullopt on failure.
+// Function type for processing ctemp out the structure of type T.
+// The process function will update the properties based on input data.
template <class T>
-using ctemp_parser_t = std::function<std::optional<double>(T data)>;
+using ctemp_process_t =
+ std::function<void(const std::error_code& error, T data)>;
template <class T>
void pollCtemp(
std::shared_ptr<boost::asio::steady_timer> timer,
- std::shared_ptr<NVMeSensor> sensor,
+ std::chrono::duration<double, std::milli> delay,
const std::function<void(std::function<void(const std::error_code&, T)>&&)>&
dataFetcher,
- const std::function<std::optional<double>(T data)>& dataParser);
+ const std::function<void(const std::error_code& error, T data)>&
+ dataProcessor);
namespace detail
{
template <class T>
void updateCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
- std::shared_ptr<NVMeSensor> sensor,
- ctemp_parser_t<T> dataParser, ctemp_fetcher_t<T> dataFetcher,
- const boost::system::error_code error, T data)
+ std::chrono::duration<double, std::milli> delay,
+ ctemp_process_t<T> dataProcessor, ctemp_fetch_t<T> dataFetcher,
+ const std::error_code& error, T data)
{
- if (error)
- {
- std::cerr << "error reading ctemp from subsystem"
- << ", reason:" << error.message() << "\n";
- sensor->markFunctional(false);
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
- return;
- }
- auto value = dataParser(data);
- if (!value)
- {
- sensor->incrementError();
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
- return;
- }
-
- sensor->updateValue(*value);
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, dataParser);
+ dataProcessor(error, data);
+ ::pollCtemp(std::move(timer), std::move(delay), dataFetcher, dataProcessor);
}
template <class T>
void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
- std::shared_ptr<NVMeSensor> sensor,
- ctemp_fetcher_t<T> dataFetcher, ctemp_parser_t<T> dataParser,
+ std::chrono::duration<double, std::milli> delay,
+ ctemp_fetch_t<T> dataFetcher, ctemp_process_t<T> dataProcessor,
const boost::system::error_code errorCode)
{
-
if (errorCode == boost::asio::error::operation_aborted)
{
return;
@@ -165,37 +149,13 @@ void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
if (errorCode)
{
std::cerr << errorCode.message() << "\n";
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
- return;
- }
-
- if (!sensor)
- {
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
- return;
- }
-
- if (!sensor->readingStateGood())
- {
- sensor->markAvailable(false);
- sensor->updateValue(std::numeric_limits<double>::quiet_NaN());
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
- return;
- }
-
- /* Potentially defer sampling the sensor sensor if it is in error */
- if (!sensor->sample())
- {
- ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher,
- dataParser);
+ ::pollCtemp(std::move(timer), std::move(delay), dataFetcher,
+ dataProcessor);
return;
}
dataFetcher(std::bind_front(detail::updateCtemp<T>, std::move(timer),
- std::move(sensor), dataParser, dataFetcher));
+ std::move(delay), dataProcessor, dataFetcher));
}
} // namespace detail
@@ -203,17 +163,19 @@ void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer,
template <class T>
void pollCtemp(
std::shared_ptr<boost::asio::steady_timer> timer,
- std::shared_ptr<NVMeSensor> sensor,
+ std::chrono::duration<double, std::milli> delay,
const std::function<void(std::function<void(const std::error_code&, T)>&&)>&
dataFetcher,
- const std::function<std::optional<double>(T data)>& dataParser)
+ const std::function<void(const std::error_code& error, T data)>&
+ dataProcessor)
{
- if (!timer && !sensor)
+ if (!timer)
{
return;
}
- timer->expires_from_now(std::chrono::seconds(1));
+ timer->expires_from_now(
+ std::chrono::duration_cast<std::chrono::milliseconds>(delay));
timer->async_wait(std::bind_front(detail::pollCtemp<T>, std::move(timer),
- std::move(sensor), dataFetcher,
- dataParser));
+ std::move(delay), dataFetcher,
+ dataProcessor));
}
diff --git a/tests/test_nvme_mi.cpp b/tests/test_nvme_mi.cpp
index 581f20b..8462d5e 100644
--- a/tests/test_nvme_mi.cpp
+++ b/tests/test_nvme_mi.cpp
@@ -228,7 +228,8 @@ TEST_F(NVMeTest, TestDriveFunctional)
[&](std::function<void(const std::error_code&,
nvme_mi_nvm_ss_health_status*)>&&
cb) {
- std::cerr << "mock health poll" << std::endl;
+ std::cerr << "mock device not functional health poll"
+ << std::endl;
// return status.nss.df = 0
return io.post([cb = std::move(cb)]() {
nvme_mi_nvm_ss_health_status status;
@@ -264,7 +265,97 @@ TEST_F(NVMeTest, TestDriveFunctional)
EXPECT_EQ(result.size(), 2);
subsys->stop();
- io.stop();
+ io.post([&]() { io.stop(); });
+ },
+ "xyz.openbmc_project.ObjectMapper",
+ "/xyz/openbmc_project/object_mapper",
+ "xyz.openbmc_project.ObjectMapper", "GetSubTree",
+ subsys_path, 0,
+ std::vector<std::string>{
+ "xyz.openbmc_project.Inventory."
+ "Item.StorageController"});
+ });
+ },
+ "xyz.openbmc_project.ObjectMapper",
+ "/xyz/openbmc_project/object_mapper",
+ "xyz.openbmc_project.ObjectMapper", "GetSubTree",
+ subsys_path, 0,
+ std::vector<std::string>{"xyz.openbmc_project.Inventory."
+ "Item.StorageController"});
+ });
+ },
+ "xyz.openbmc_project.ObjectMapper",
+ "/xyz/openbmc_project/object_mapper",
+ "xyz.openbmc_project.ObjectMapper", "GetSubTree", subsys_path, 0,
+ std::vector<std::string>{
+ "xyz.openbmc_project.Inventory.Item.StorageController"});
+ });
+ io.run();
+}
+
+/**
+ * @brief Test NVMeMi returns Drive is absent (ec = no_such_device)
+ *
+ */
+TEST_F(NVMeTest, TestDriveAbsent)
+{
+ using ::testing::AtLeast;
+ boost::asio::steady_timer timer(io);
+
+ EXPECT_CALL(mock, miSubsystemHealthStatusPoll).Times(AtLeast(1));
+ EXPECT_CALL(mock, adminIdentify).Times(AtLeast(1));
+ EXPECT_CALL(mock, miScanCtrl).Times(AtLeast(1));
+
+ // wait for subsystem initialization
+ timer.expires_after(std::chrono::seconds(2));
+ timer.async_wait([&](boost::system::error_code) {
+ system_bus->async_method_call(
+ [&](boost::system::error_code, const GetSubTreeType& result) {
+ // Only PF and the enabled VF should be listed
+ EXPECT_EQ(result.size(), 2);
+
+ // mimik communication error of NVMeMI request
+ ON_CALL(mock, miSubsystemHealthStatusPoll)
+ .WillByDefault(
+ [&](std::function<void(const std::error_code&,
+ nvme_mi_nvm_ss_health_status*)>&&
+ cb) {
+ std::cerr << "mock device absent health poll" << std::endl;
+ // return no_such_device
+ return io.post([cb = std::move(cb)]() {
+ cb(std::make_error_code(std::errc::no_such_device),
+ nullptr);
+ });
+ });
+
+ // wait for storage controller destruction.
+ timer.expires_after(std::chrono::seconds(2));
+ timer.async_wait([&](boost::system::error_code) {
+ system_bus->async_method_call(
+ [&](boost::system::error_code,
+ const GetSubTreeType& result) {
+ // no storage controller should be listed.
+ EXPECT_EQ(result.size(), 0);
+
+ // restart sending normal polling result
+ ON_CALL(mock, miSubsystemHealthStatusPoll)
+ .WillByDefault(
+ [&](std::function<void(
+ const std::error_code&,
+ nvme_mi_nvm_ss_health_status*)>&& cb) {
+ return mock.fake->miSubsystemHealthStatusPoll(
+ std::move(cb));
+ });
+ timer.expires_after(std::chrono::seconds(2));
+ timer.async_wait([&](boost::system::error_code) {
+ system_bus->async_method_call(
+ [&](boost::system::error_code,
+ const GetSubTreeType& result) {
+ // storage controller should be restored.
+ EXPECT_EQ(result.size(), 2);
+
+ subsys->stop();
+ io.post([&]() { io.stop(); });
},
"xyz.openbmc_project.ObjectMapper",
"/xyz/openbmc_project/object_mapper",
--
2.34.1