Add retry to IntelCpuSensor reinitialization
As described in b/454633579, flakiness of IntelCpuSensor can occur at startup due to DIMM hwmon files not being present in sysfs. Add a retry here to allow attempting re-creation of these sensors after the initial detection that DIMMs are ready. The delays by default will be 5, 30, and 60 seconds, since we observe that the IntelCpuSensor daemon does not hit this issue when it starts ~2 minutes after bmcweb.
This retry mechanism will be replaced once the SMBIOS table is owned by tlBMC, which is in scope for Fast Sanity (b/444724588).
#tlbmc
PiperOrigin-RevId: 825678136
Change-Id: I1c8c1429096e042386daf3f84cc2d8b08d5486b0
diff --git a/tlbmc/sensors/intel_cpu_sensor.cc b/tlbmc/sensors/intel_cpu_sensor.cc
index 2b9954b..8b80dd7 100644
--- a/tlbmc/sensors/intel_cpu_sensor.cc
+++ b/tlbmc/sensors/intel_cpu_sensor.cc
@@ -4,6 +4,7 @@
#include <array>
#include <cctype>
#include <charconv>
+#include <chrono> // NOLINT: chrono is commonly used in BMC
#include <cstdint>
#include <cstring>
#include <fstream>
@@ -424,27 +425,13 @@
sensor_attributes_static_.hal_common_config());
auto it = label_to_input_file.find(sensor_label_);
+ std::string error_message =
+ absl::StrCat("Failed to find ", sensor_label_, " in hwmon folder ",
+ peci_device_path.string());
if (it == label_to_input_file.end()) {
- // We expect 32 or 16 DIMM sensors. Since tlBMC must statically configure
- // all sensors, we leave the uninitialized DIMM sensors as is if not found.
- // (b/449180255) - CPU Core sensors vary on machine startup, allow for tlBMC
- // to ignore missing Core sensors from static configuration.
- if (absl::StrContains(sensor_label_, "DIMM") ||
- absl::StrContains(sensor_label_, "Core")) {
- LOG(WARNING) << absl::StrCat("Failed to find ", sensor_label_,
- " in hwmon folder ",
- peci_device_path.string());
- return absl::OkStatus();
- }
+ LOG(WARNING) << error_message;
// Set state to CREATION_FAILED with error message to indicate that the
// sensor was attempted to be created but failed.
- std::string error_message =
- absl::StrCat("Failed to find ", sensor_label_, " in hwmon folder ",
- peci_device_path.string());
- State state;
- state.set_status(STATUS_CREATION_FAILED);
- state.set_status_message(error_message);
- UpdateState(std::move(state));
return absl::NotFoundError(error_message);
}
@@ -467,11 +454,50 @@
return absl::OkStatus();
}
+// Runs ReinitializeInternal() and, if it fails, schedules another run on
+// retry_timer_ after a delay taken from retry_delays_ (in seconds).
+//
+// `callback` receives:
+//   - the OK status as soon as an attempt succeeds;
+//   - the last failure status once `attempts_left` reaches zero (the sensor
+//     state is set to STATUS_CREATION_FAILED first);
+//   - an InternalError if the timer wait completes with an error code.
+//
+// `attempts_left` is the number of further attempts allowed after this one.
+void IntelCpuSensor::ReinitializeWithRetry(
+    absl::AnyInvocable<void(absl::Status)> callback, uint32_t attempts_left) {
+  absl::Status status = ReinitializeInternal();
+  if (status.ok()) {
+    callback(status);
+    return;
+  }
+
+  // Report the attempts that really remain. Logging `attempts_left - 1` on an
+  // unsigned value wrapped to 4294967295 on the final attempt and
+  // under-reported by one on every earlier attempt.
+  LOG(WARNING) << "Failed to reinitialize IntelCpuSensor: " << sensor_name_
+               << " with status: " << status
+               << ". Attempts left: " << attempts_left;
+
+  if (attempts_left < 1) {
+    // Out of retries: record the failure on the sensor state before
+    // reporting it to the caller.
+    // NOTE(review): the message hard-codes "3 attempts" regardless of the
+    // initial `attempts_left` supplied by the caller. The text is kept
+    // unchanged in case consumers match on it; consider deriving the count.
+    State state;
+    state.set_status(STATUS_CREATION_FAILED);
+    state.set_status_message(
+        absl::StrCat("Failed to reinitialize IntelCpuSensor after 3 attempts. "
+                     "Last status: ",
+                     status.message()));
+    UpdateState(std::move(state));
+    callback(status);
+    return;
+  }
+
+  // Select the delay for this retry. The last `attempts_left` entries of
+  // retry_delays_ are consumed in order. If more attempts remain than there
+  // are entries, the first entry is reused; if no delays are configured, the
+  // retry is scheduled with a zero delay. This avoids the unsigned underflow
+  // (and out-of-bounds read) of `size() - attempts_left`.
+  const auto delay_count = retry_delays_.size();
+  uint32_t delay_seconds = 0;
+  if (delay_count > 0) {
+    const auto delay_index =
+        attempts_left < delay_count ? delay_count - attempts_left : 0;
+    delay_seconds = retry_delays_[delay_index];
+  }
+
+  retry_timer_.expires_after(std::chrono::seconds(delay_seconds));
+  retry_timer_.async_wait([this, callback = std::move(callback), attempts_left](
+                              const boost::system::error_code& ec) mutable {
+    if (ec) {
+      // Do not touch sensor members on this path; only the captured callback
+      // is used to report the timer failure.
+      LOG(ERROR) << "Retry timer error: " << ec.message();
+      callback(absl::InternalError(absl::StrCat(
+          "Retry timer error while reinitializing IntelCpuSensor: ",
+          ec.message())));
+      return;
+    }
+    ReinitializeWithRetry(std::move(callback), attempts_left - 1);
+  });
+}
+
+// Posts the reinitialization onto io_context_ and reports the final status
+// through `callback`. Failed attempts are retried by ReinitializeWithRetry().
void IntelCpuSensor::Reinitialize(
absl::AnyInvocable<void(absl::Status)> callback) {
boost::asio::post(*io_context_,
[this, callback = std::move(callback)]() mutable {
- callback(ReinitializeInternal());
+ // Allow one retry per configured delay so that every retry has a
+ // matching retry_delays_ entry, including when a non-default delay
+ // list was supplied at construction.
+ ReinitializeWithRetry(
+     std::move(callback),
+     static_cast<uint32_t>(retry_delays_.size()));
});
}
@@ -494,10 +520,22 @@
sensor_type_(sensor_type),
sensor_name_(sensor_name),
sensor_label_(sensor_label),
- dts_offset_(dts_offset) {}
+ dts_offset_(dts_offset),
+ retry_timer_(*io_context) {}
-IntelCpuSensor::IntelCpuSensor(const PeciSysfs& peci_sysfs)
- : PeciHwmonBasedSensor("", nullptr, SensorAttributesStatic(),
- ThresholdConfigs(), std::nullopt, peci_sysfs) {}
+// Test-only constructor. Builds a sensor with an empty name and unknown unit
+// on top of `peci_sysfs`, taking the HAL common config from `config`.
+// `sensor_label` is the label stored in sensor_label_, and `retry_delays`
+// replaces the default retry delays (in seconds).
+// `io_context` must be non-null: it is dereferenced to construct
+// retry_timer_.
+IntelCpuSensor::IntelCpuSensor(
+ const PeciSysfs& peci_sysfs, const IntelCpuSensorConfig& config,
+ const std::string& sensor_label,
+ const std::shared_ptr<boost::asio::io_context>& io_context,
+ const std::vector<uint32_t>& retry_delays)
+ : PeciHwmonBasedSensor("", io_context,
+ CreateStaticAttributes(
+ "", SensorUnit::UNIT_UNKNOWN,
+ config.hal_common_config(), EntityCommonConfig(),
+ ReadingRangeConfigs(), ReadingTransformConfig()),
+ ThresholdConfigs(), std::nullopt, peci_sysfs),
+ retry_delays_(retry_delays),
+ sensor_label_(sensor_label),
+ retry_timer_(*io_context) {}
} // namespace milotic_tlbmc
diff --git a/tlbmc/sensors/intel_cpu_sensor.h b/tlbmc/sensors/intel_cpu_sensor.h
index b1816f8..45b3f20 100644
--- a/tlbmc/sensors/intel_cpu_sensor.h
+++ b/tlbmc/sensors/intel_cpu_sensor.h
@@ -116,7 +116,11 @@
IntelCpuSensor() = default;
// For unit testing only.
- explicit IntelCpuSensor(const PeciSysfs& peci_sysfs);
+ explicit IntelCpuSensor(
+ const PeciSysfs& peci_sysfs, const IntelCpuSensorConfig& config,
+ const std::string& sensor_label,
+ const std::shared_ptr<boost::asio::io_context>& io_context,
+ const std::vector<uint32_t>& retry_delays);
static absl::StatusOr<std::tuple<std::string, std::string, std::string>>
SplitInputFile(const boost::filesystem::path& input_file_path);
@@ -136,10 +140,16 @@
absl::Status ReinitializeInternal();
+ // Calls ReinitializeInternal(); on failure, waits on retry_timer_ and tries
+ // again while `attempts_left` is non-zero. `callback` receives the final
+ // status (OK on success, otherwise the last failure or a timer error).
+ void ReinitializeWithRetry(absl::AnyInvocable<void(absl::Status)> callback,
+ uint32_t attempts_left);
+
const IntelCpuSensorType sensor_type_ = INTEL_CPU_SENSOR_TYPE0_UNKNOWN;
+ // Delays, in seconds, waited before successive retries in
+ // ReinitializeWithRetry().
+ const std::vector<uint32_t> retry_delays_ = {5, 30, 60};
const std::string sensor_name_;
const std::string sensor_label_;
const double dts_offset_ = 0.0;
+
+ // Timer used to schedule delayed reinitialization retries; constructed from
+ // the io_context passed to the constructor.
+ boost::asio::steady_timer retry_timer_;
};
} // namespace milotic_tlbmc