| From f89349b1a54b3e5bb6dcb3ddd027f90543c9a46e Mon Sep 17 00:00:00 2001 |
| From: Hao Jiang <jianghao@google.com> |
| Date: Thu, 20 Apr 2023 20:48:54 +0000 |
| Subject: [PATCH 32/44] nvmesensor: subsystem poll supports absence |
| |
| Enhance the polling function to support more symptoms. For example, the |
| endpoint absence. The signal comes from NVMeMi class because the |
| absence detection was done by transporation layer(mctp or i2c binding), |
| instead of the NVMe-MI protocol layer. |
| |
| The enhancement will be furthur utilized by other symptoms provided by |
| NSHDS (NVM Subsystem Health Data Structure), including but not limited |
| to SW, CCS.CSTS, CCS.NAC, etc. |
| |
| Tested: passed the unit test of TestDriveAbsent with increase polling |
| rate of 50 ms. |
| |
| Signed-off-by: Hao Jiang <jianghao@google.com> |
| Change-Id: I5efe7c07e07630064b5e6e7fd707f3c286673bba |
| --- |
| src/NVMeBasic.cpp | 13 +--- |
| src/NVMeIntf.hpp | 1 + |
| src/NVMeSubsys.cpp | 146 +++++++++++++++++++++++++++++++++-------- |
| src/NVMeSubsys.hpp | 1 - |
| src/NVMeUtil.hpp | 92 ++++++++------------------ |
| tests/test_nvme_mi.cpp | 95 ++++++++++++++++++++++++++- |
| 6 files changed, 242 insertions(+), 106 deletions(-) |
| |
| diff --git a/src/NVMeBasic.cpp b/src/NVMeBasic.cpp |
| index 3a2d4ca..d18aed4 100644 |
| --- a/src/NVMeBasic.cpp |
| +++ b/src/NVMeBasic.cpp |
| @@ -342,23 +342,12 @@ void NVMeBasic::getStatus( |
| |
| /* Process the callback */ |
| self->io.post([data, cb{std::move(cb)}]() { |
| - if (data->size() < |
| - sizeof(DriveStatus) + 1) // The first byte is status flags |
| + if (data->size() < sizeof(DriveStatus)) |
| { |
| cb(std::make_error_code(std::errc::message_size), nullptr); |
| return; |
| } |
| |
| - uint8_t flags = static_cast<uint8_t>(data->front()); |
| - if (((flags & NVME_MI_BASIC_SFLGS_DRIVE_NOT_READY) != 0) || |
| - ((flags & NVME_MI_BASIC_SFLGS_DRIVE_FUNCTIONAL) == 0)) |
| - { |
| - cb(std::make_error_code(std::errc::no_such_device), |
| - nullptr); |
| - return; |
| - } |
| - |
| - data->erase(data->begin()); |
| // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) |
| cb({}, reinterpret_cast<DriveStatus*>(data->data())); |
| }); |
| diff --git a/src/NVMeIntf.hpp b/src/NVMeIntf.hpp |
| index e52a700..f87b536 100644 |
| --- a/src/NVMeIntf.hpp |
| +++ b/src/NVMeIntf.hpp |
| @@ -78,6 +78,7 @@ class NVMeBasicIntf |
| public: |
| struct DriveStatus |
| { |
| + uint8_t Status; |
| uint8_t SmartWarnings; |
| uint8_t Temp; |
| uint8_t DriveLifeUsed; |
| diff --git a/src/NVMeSubsys.cpp b/src/NVMeSubsys.cpp |
| index 10b360d..c605e30 100644 |
| --- a/src/NVMeSubsys.cpp |
| +++ b/src/NVMeSubsys.cpp |
| @@ -1,6 +1,7 @@ |
| #include "NVMeSubsys.hpp" |
| |
| #include "NVMePlugin.hpp" |
| +#include "NVMeUtil.hpp" |
| #include "Thresholds.hpp" |
| |
| #include <filesystem> |
| @@ -303,60 +304,153 @@ void NVMeSubsystem::start() |
| { |
| auto intf = |
| std::get<std::shared_ptr<NVMeBasicIntf>>(nvmeIntf.getInferface()); |
| - ctemp_fetcher_t<NVMeBasicIntf::DriveStatus*> dataFether = |
| - [intf](std::function<void(const std::error_code&, |
| - NVMeBasicIntf::DriveStatus*)>&& cb) { |
| + ctemp_fetch_t<NVMeBasicIntf::DriveStatus*> dataFether = |
| + [intf, self{std::move(shared_from_this())}]( |
| + std::function<void(const std::error_code&, |
| + NVMeBasicIntf::DriveStatus*)>&& cb) { |
| + /* Potentially defer sampling the sensor sensor if it is in error */ |
| + if (!self->ctemp->sample()) |
| + { |
| + cb(std::make_error_code(std::errc::operation_canceled), |
| + nullptr); |
| + return; |
| + } |
| + |
| intf->getStatus(std::move(cb)); |
| }; |
| - ctemp_parser_t<NVMeBasicIntf::DriveStatus*> dataParser = |
| - [](NVMeBasicIntf::DriveStatus* status) -> std::optional<double> { |
| + ctemp_process_t<NVMeBasicIntf::DriveStatus*> dataProcessor = |
| + [self{shared_from_this()}](const std::error_code& error, |
| + NVMeBasicIntf::DriveStatus* status) { |
| + // deferred sampling |
| + if (error == std::errc::operation_canceled) |
| + { |
| + return; |
| + } |
| + // The device is physically absent |
| + else if (error == std::errc::no_such_device) |
| + { |
| + std::cerr << "error reading ctemp from subsystem" |
| + << ", reason:" << error.message() << "\n"; |
| + self->ctemp->markFunctional(false); |
| + self->ctemp->markAvailable(false); |
| + return; |
| + } |
| + // other communication errors |
| + else if (error) |
| + { |
| + std::cerr << "error reading ctemp from subsystem" |
| + << ", reason:" << error.message() << "\n"; |
| + self->ctemp->incrementError(); |
| + return; |
| + } |
| + |
| if (status == nullptr) |
| { |
| - return std::nullopt; |
| + std::cerr << "empty data returned by data fetcher" << std::endl; |
| + self->ctemp->markFunctional(false); |
| + return; |
| } |
| - return {getTemperatureReading(status->Temp)}; |
| + |
| + uint8_t flags = status->Status; |
| + if (((flags & NVMeBasicIntf::StatusFlags:: |
| + NVME_MI_BASIC_SFLGS_DRIVE_NOT_READY) != 0) || |
| + ((flags & NVMeBasicIntf::StatusFlags:: |
| + NVME_MI_BASIC_SFLGS_DRIVE_FUNCTIONAL) == 0)) |
| + { |
| + self->ctemp->markFunctional(false); |
| + return; |
| + } |
| + self->ctemp->updateValue(getTemperatureReading(status->Temp)); |
| }; |
| - pollCtemp(this->ctempTimer, this->ctemp, dataFether, dataParser); |
| + |
| + pollCtemp(ctempTimer, std::chrono::seconds(1), dataFether, |
| + dataProcessor); |
| } |
| else if (nvmeIntf.getProtocol() == NVMeIntf::Protocol::NVMeMI) |
| { |
| auto intf = |
| std::get<std::shared_ptr<NVMeMiIntf>>(nvmeIntf.getInferface()); |
| |
| - ctemp_fetcher_t<nvme_mi_nvm_ss_health_status*> dataFether = |
| - [intf](std::function<void(const std::error_code&, |
| - nvme_mi_nvm_ss_health_status*)>&& cb) { |
| + ctemp_fetch_t<nvme_mi_nvm_ss_health_status*> dataFether = |
| + [intf, self{std::move(shared_from_this())}]( |
| + std::function<void(const std::error_code&, |
| + nvme_mi_nvm_ss_health_status*)>&& cb) { |
| + // do not poll the health status if the subsystem is intiatilzing |
| + if (self->status == Status::Intiatilzing) |
| + { |
| + std::cerr << "subsystem is intiatilzing, cancel the health poll" |
| + << std::endl; |
| + cb(std::make_error_code(std::errc::operation_canceled), |
| + nullptr); |
| + return; |
| + } |
| intf->miSubsystemHealthStatusPoll(std::move(cb)); |
| }; |
| - ctemp_parser_t<nvme_mi_nvm_ss_health_status*> dataParser = |
| - [self{shared_from_this()}]( |
| - nvme_mi_nvm_ss_health_status* status) -> std::optional<double> { |
| - // Drive Functional |
| - bool df = status->nss & 0x20; |
| - if (!df) |
| + ctemp_process_t<nvme_mi_nvm_ss_health_status*> dataProcessor = |
| + [self{shared_from_this()}](const std::error_code& error, |
| + nvme_mi_nvm_ss_health_status* status) { |
| + if (error == std::errc::operation_canceled || |
| + self->status == Status::Intiatilzing) |
| { |
| + // on initialization, the subsystem will not update the status. |
| + std::cerr |
| + << "subsystem is intiatilzing, do not process the status" |
| + << std::endl; |
| + return; |
| + } |
| + |
| + if (error == std::errc::no_such_device) |
| + { |
| + std::cerr << "error reading ctemp " |
| + "from subsystem" |
| + << ", reason:" << error.message() << "\n"; |
| // stop the subsystem |
| - try |
| + self->markFunctional(false); |
| + self->ctemp->markFunctional(false); |
| + self->ctemp->markAvailable(false); |
| + |
| + return; |
| + } |
| + else if (error) |
| + { |
| + std::cerr << "error reading ctemp " |
| + "from subsystem" |
| + << ", reason:" << error.message() << "\n"; |
| + self->ctemp->incrementError(); |
| + if (self->ctemp->inError()) |
| { |
| self->markFunctional(false); |
| self->ctemp->markFunctional(false); |
| } |
| - catch (const std::runtime_error& e) |
| - { |
| - std::cerr << e.what() << std::endl; |
| - } |
| - return std::nullopt; |
| + return; |
| + } |
| + |
| + // Drive Functional |
| + bool df = status->nss & 0x20; |
| + if (!df) |
| + { |
| + // stop the subsystem |
| + |
| + self->markFunctional(false); |
| + self->ctemp->markFunctional(false); |
| + |
| + return; |
| } |
| |
| // restart the subsystem |
| if (self->status == Status::Stop) |
| { |
| self->markFunctional(true); |
| - self->ctemp->markFunctional(true); |
| } |
| - return {getTemperatureReading(status->ctemp)}; |
| + |
| + // TODO: update the drive interface |
| + |
| + self->ctemp->updateValue(getTemperatureReading(status->ctemp)); |
| + return; |
| }; |
| - pollCtemp(ctempTimer, ctemp, dataFether, dataParser); |
| + |
| + pollCtemp(ctempTimer, std::chrono::seconds(1), dataFether, |
| + dataProcessor); |
| } |
| } |
| void NVMeSubsystem::stop() |
| diff --git a/src/NVMeSubsys.hpp b/src/NVMeSubsys.hpp |
| index e5db58a..6bd8b62 100644 |
| --- a/src/NVMeSubsys.hpp |
| +++ b/src/NVMeSubsys.hpp |
| @@ -4,7 +4,6 @@ |
| #include "NVMeDrive.hpp" |
| #include "NVMeSensor.hpp" |
| #include "NVMeStorage.hpp" |
| -#include "NVMeUtil.hpp" |
| #include "Utils.hpp" |
| |
| class NVMeControllerPlugin; |
| diff --git a/src/NVMeUtil.hpp b/src/NVMeUtil.hpp |
| index 16fe291..8639113 100644 |
| --- a/src/NVMeUtil.hpp |
| +++ b/src/NVMeUtil.hpp |
| @@ -7,6 +7,7 @@ |
| #include <filesystem> |
| #include <iostream> |
| #include <optional> |
| +#include <system_error> |
| |
| inline std::filesystem::path deriveRootBusPath(int busNumber) |
| { |
| @@ -104,60 +105,43 @@ inline std::optional<std::string> |
| // Function type for fetching ctemp which encaplucated in a structure of T. |
| // The fetcher function take a callback as input to process the result. |
| template <class T> |
| -using ctemp_fetcher_t = |
| +using ctemp_fetch_t = |
| std::function<void(std::function<void(const std::error_code&, T)>&&)>; |
| |
| -// Function type for parsing ctemp out the structure of type T. |
| -// The parser function will return the value of ctemp or nullopt on failure. |
| +// Function type for processing ctemp out the structure of type T. |
| +// The process function will update the properties based on input data. |
| template <class T> |
| -using ctemp_parser_t = std::function<std::optional<double>(T data)>; |
| +using ctemp_process_t = |
| + std::function<void(const std::error_code& error, T data)>; |
| |
| template <class T> |
| void pollCtemp( |
| std::shared_ptr<boost::asio::steady_timer> timer, |
| - std::shared_ptr<NVMeSensor> sensor, |
| + std::chrono::duration<double, std::milli> delay, |
| const std::function<void(std::function<void(const std::error_code&, T)>&&)>& |
| dataFetcher, |
| - const std::function<std::optional<double>(T data)>& dataParser); |
| + const std::function<void(const std::error_code& error, T data)>& |
| + dataProcessor); |
| |
| namespace detail |
| { |
| |
| template <class T> |
| void updateCtemp(std::shared_ptr<boost::asio::steady_timer> timer, |
| - std::shared_ptr<NVMeSensor> sensor, |
| - ctemp_parser_t<T> dataParser, ctemp_fetcher_t<T> dataFetcher, |
| - const boost::system::error_code error, T data) |
| + std::chrono::duration<double, std::milli> delay, |
| + ctemp_process_t<T> dataProcessor, ctemp_fetch_t<T> dataFetcher, |
| + const std::error_code& error, T data) |
| { |
| - if (error) |
| - { |
| - std::cerr << "error reading ctemp from subsystem" |
| - << ", reason:" << error.message() << "\n"; |
| - sensor->markFunctional(false); |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| - return; |
| - } |
| - auto value = dataParser(data); |
| - if (!value) |
| - { |
| - sensor->incrementError(); |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| - return; |
| - } |
| - |
| - sensor->updateValue(*value); |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, dataParser); |
| + dataProcessor(error, data); |
| + ::pollCtemp(std::move(timer), std::move(delay), dataFetcher, dataProcessor); |
| } |
| |
| template <class T> |
| void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer, |
| - std::shared_ptr<NVMeSensor> sensor, |
| - ctemp_fetcher_t<T> dataFetcher, ctemp_parser_t<T> dataParser, |
| + std::chrono::duration<double, std::milli> delay, |
| + ctemp_fetch_t<T> dataFetcher, ctemp_process_t<T> dataProcessor, |
| const boost::system::error_code errorCode) |
| { |
| - |
| if (errorCode == boost::asio::error::operation_aborted) |
| { |
| return; |
| @@ -165,37 +149,13 @@ void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer, |
| if (errorCode) |
| { |
| std::cerr << errorCode.message() << "\n"; |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| - return; |
| - } |
| - |
| - if (!sensor) |
| - { |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| - return; |
| - } |
| - |
| - if (!sensor->readingStateGood()) |
| - { |
| - sensor->markAvailable(false); |
| - sensor->updateValue(std::numeric_limits<double>::quiet_NaN()); |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| - return; |
| - } |
| - |
| - /* Potentially defer sampling the sensor sensor if it is in error */ |
| - if (!sensor->sample()) |
| - { |
| - ::pollCtemp(std::move(timer), std::move(sensor), dataFetcher, |
| - dataParser); |
| + ::pollCtemp(std::move(timer), std::move(delay), dataFetcher, |
| + dataProcessor); |
| return; |
| } |
| |
| dataFetcher(std::bind_front(detail::updateCtemp<T>, std::move(timer), |
| - std::move(sensor), dataParser, dataFetcher)); |
| + std::move(delay), dataProcessor, dataFetcher)); |
| } |
| |
| } // namespace detail |
| @@ -203,17 +163,19 @@ void pollCtemp(std::shared_ptr<boost::asio::steady_timer> timer, |
| template <class T> |
| void pollCtemp( |
| std::shared_ptr<boost::asio::steady_timer> timer, |
| - std::shared_ptr<NVMeSensor> sensor, |
| + std::chrono::duration<double, std::milli> delay, |
| const std::function<void(std::function<void(const std::error_code&, T)>&&)>& |
| dataFetcher, |
| - const std::function<std::optional<double>(T data)>& dataParser) |
| + const std::function<void(const std::error_code& error, T data)>& |
| + dataProcessor) |
| { |
| - if (!timer && !sensor) |
| + if (!timer) |
| { |
| return; |
| } |
| - timer->expires_from_now(std::chrono::seconds(1)); |
| + timer->expires_from_now( |
| + std::chrono::duration_cast<std::chrono::milliseconds>(delay)); |
| timer->async_wait(std::bind_front(detail::pollCtemp<T>, std::move(timer), |
| - std::move(sensor), dataFetcher, |
| - dataParser)); |
| + std::move(delay), dataFetcher, |
| + dataProcessor)); |
| } |
| diff --git a/tests/test_nvme_mi.cpp b/tests/test_nvme_mi.cpp |
| index 581f20b..8462d5e 100644 |
| --- a/tests/test_nvme_mi.cpp |
| +++ b/tests/test_nvme_mi.cpp |
| @@ -228,7 +228,8 @@ TEST_F(NVMeTest, TestDriveFunctional) |
| [&](std::function<void(const std::error_code&, |
| nvme_mi_nvm_ss_health_status*)>&& |
| cb) { |
| - std::cerr << "mock health poll" << std::endl; |
| + std::cerr << "mock device not functional health poll" |
| + << std::endl; |
| // return status.nss.df = 0 |
| return io.post([cb = std::move(cb)]() { |
| nvme_mi_nvm_ss_health_status status; |
| @@ -264,7 +265,97 @@ TEST_F(NVMeTest, TestDriveFunctional) |
| EXPECT_EQ(result.size(), 2); |
| |
| subsys->stop(); |
| - io.stop(); |
| + io.post([&]() { io.stop(); }); |
| + }, |
| + "xyz.openbmc_project.ObjectMapper", |
| + "/xyz/openbmc_project/object_mapper", |
| + "xyz.openbmc_project.ObjectMapper", "GetSubTree", |
| + subsys_path, 0, |
| + std::vector<std::string>{ |
| + "xyz.openbmc_project.Inventory." |
| + "Item.StorageController"}); |
| + }); |
| + }, |
| + "xyz.openbmc_project.ObjectMapper", |
| + "/xyz/openbmc_project/object_mapper", |
| + "xyz.openbmc_project.ObjectMapper", "GetSubTree", |
| + subsys_path, 0, |
| + std::vector<std::string>{"xyz.openbmc_project.Inventory." |
| + "Item.StorageController"}); |
| + }); |
| + }, |
| + "xyz.openbmc_project.ObjectMapper", |
| + "/xyz/openbmc_project/object_mapper", |
| + "xyz.openbmc_project.ObjectMapper", "GetSubTree", subsys_path, 0, |
| + std::vector<std::string>{ |
| + "xyz.openbmc_project.Inventory.Item.StorageController"}); |
| + }); |
| + io.run(); |
| +} |
| + |
| +/** |
| + * @brief Test NVMeMi returns Drive is absent (ec = no_such_device) |
| + * |
| + */ |
| +TEST_F(NVMeTest, TestDriveAbsent) |
| +{ |
| + using ::testing::AtLeast; |
| + boost::asio::steady_timer timer(io); |
| + |
| + EXPECT_CALL(mock, miSubsystemHealthStatusPoll).Times(AtLeast(1)); |
| + EXPECT_CALL(mock, adminIdentify).Times(AtLeast(1)); |
| + EXPECT_CALL(mock, miScanCtrl).Times(AtLeast(1)); |
| + |
| + // wait for subsystem initialization |
| + timer.expires_after(std::chrono::seconds(2)); |
| + timer.async_wait([&](boost::system::error_code) { |
| + system_bus->async_method_call( |
| + [&](boost::system::error_code, const GetSubTreeType& result) { |
| + // Only PF and the enabled VF should be listed |
| + EXPECT_EQ(result.size(), 2); |
| + |
| + // mimik communication error of NVMeMI request |
| + ON_CALL(mock, miSubsystemHealthStatusPoll) |
| + .WillByDefault( |
| + [&](std::function<void(const std::error_code&, |
| + nvme_mi_nvm_ss_health_status*)>&& |
| + cb) { |
| + std::cerr << "mock device absent health poll" << std::endl; |
| + // return no_such_device |
| + return io.post([cb = std::move(cb)]() { |
| + cb(std::make_error_code(std::errc::no_such_device), |
| + nullptr); |
| + }); |
| + }); |
| + |
| + // wait for storage controller destruction. |
| + timer.expires_after(std::chrono::seconds(2)); |
| + timer.async_wait([&](boost::system::error_code) { |
| + system_bus->async_method_call( |
| + [&](boost::system::error_code, |
| + const GetSubTreeType& result) { |
| + // no storage controller should be listed. |
| + EXPECT_EQ(result.size(), 0); |
| + |
| + // restart sending normal polling result |
| + ON_CALL(mock, miSubsystemHealthStatusPoll) |
| + .WillByDefault( |
| + [&](std::function<void( |
| + const std::error_code&, |
| + nvme_mi_nvm_ss_health_status*)>&& cb) { |
| + return mock.fake->miSubsystemHealthStatusPoll( |
| + std::move(cb)); |
| + }); |
| + timer.expires_after(std::chrono::seconds(2)); |
| + timer.async_wait([&](boost::system::error_code) { |
| + system_bus->async_method_call( |
| + [&](boost::system::error_code, |
| + const GetSubTreeType& result) { |
| + // storage controller should be restored. |
| + EXPECT_EQ(result.size(), 2); |
| + |
| + subsys->stop(); |
| + io.post([&]() { io.stop(); }); |
| }, |
| "xyz.openbmc_project.ObjectMapper", |
| "/xyz/openbmc_project/object_mapper", |
| -- |
| 2.34.1 |
| |