Add retry to IntelCpuSensor reinitialization

As described in b/454633579, flakiness of IntelCpuSensor can occur at startup due to DIMM hwmon files not being present in sysfs. Add a retry here to allow attempting re-creation of these sensors after the initial detection that DIMMs are ready. The delays by default will be 5, 30, and 60 seconds, since we observe that the IntelCpuSensor daemon starts ~2 minutes after bmcweb and does not hit this issue.

This retry attempt will be replaced once the SMBIOS table is owned by tlBMC, as in scope of Fast Sanity (b/444724588).

#tlbmc

PiperOrigin-RevId: 825678136
Change-Id: I1c8c1429096e042386daf3f84cc2d8b08d5486b0
diff --git a/tlbmc/sensors/intel_cpu_sensor.cc b/tlbmc/sensors/intel_cpu_sensor.cc
index 2b9954b..8b80dd7 100644
--- a/tlbmc/sensors/intel_cpu_sensor.cc
+++ b/tlbmc/sensors/intel_cpu_sensor.cc
@@ -4,6 +4,7 @@
 #include <array>
 #include <cctype>
 #include <charconv>
+#include <chrono>  // NOLINT: chrono is commonly used in BMC
 #include <cstdint>
 #include <cstring>
 #include <fstream>
@@ -424,27 +425,13 @@
                                 sensor_attributes_static_.hal_common_config());
 
   auto it = label_to_input_file.find(sensor_label_);
+  std::string error_message =
+      absl::StrCat("Failed to find ", sensor_label_, " in hwmon folder ",
+                   peci_device_path.string());
   if (it == label_to_input_file.end()) {
-    // We expect 32 or 16 DIMM sensors. Since tlBMC must statically configure
-    // all sensors, we leave the uninitialized DIMM sensors as is if not found.
-    // (b/449180255) - CPU Core sensors vary on machine startup, allow for tlBMC
-    // to ignore missing Core sensors from static configuration.
-    if (absl::StrContains(sensor_label_, "DIMM") ||
-        absl::StrContains(sensor_label_, "Core")) {
-      LOG(WARNING) << absl::StrCat("Failed to find ", sensor_label_,
-                                   " in hwmon folder ",
-                                   peci_device_path.string());
-      return absl::OkStatus();
-    }
+    LOG(WARNING) << error_message;
     // Set state to CREATION_FAILED with error message to indicate that the
     // sensor was attempted to be created but failed.
-    std::string error_message =
-        absl::StrCat("Failed to find ", sensor_label_, " in hwmon folder ",
-                     peci_device_path.string());
-    State state;
-    state.set_status(STATUS_CREATION_FAILED);
-    state.set_status_message(error_message);
-    UpdateState(std::move(state));
     return absl::NotFoundError(error_message);
   }
 
@@ -467,11 +454,50 @@
   return absl::OkStatus();
 }
 
+// Runs ReinitializeInternal(), re-arming retry_timer_ after each failure until
+// `attempts_left` retries are spent; `callback` receives the final status.
+void IntelCpuSensor::ReinitializeWithRetry(
+    absl::AnyInvocable<void(absl::Status)> callback, uint32_t attempts_left) {
+  absl::Status status = ReinitializeInternal();
+  if (status.ok()) {
+    callback(status);
+    return;
+  }
+  LOG(WARNING) << "Failed to reinitialize IntelCpuSensor: " << sensor_name_
+               << " with status: " << status
+               << ". Attempts left: " << attempts_left;
+  if (attempts_left < 1 || retry_delays_.empty()) {
+    State state;
+    state.set_status(STATUS_CREATION_FAILED);
+    state.set_status_message(absl::StrCat(
+        "Failed to reinitialize IntelCpuSensor after 3 attempts. Last status: ",
+        status.message()));
+    UpdateState(std::move(state));
+    callback(status);
+    return;
+  }
+  // Clamp the index: `size() - attempts_left` wraps if the table is shorter.
+  const auto count = retry_delays_.size();
+  const auto index = attempts_left > count ? 0 : count - attempts_left;
+  retry_timer_.expires_after(std::chrono::seconds(retry_delays_[index]));
+  retry_timer_.async_wait([this, callback = std::move(callback), attempts_left](
+                              const boost::system::error_code& ec) mutable {
+    if (ec) {
+      LOG(ERROR) << "Retry timer error: " << ec.message();
+      callback(absl::InternalError(absl::StrCat(
+          "Retry timer error while reinitializing IntelCpuSensor: ",
+          ec.message())));
+      return;
+    }
+    ReinitializeWithRetry(std::move(callback), attempts_left - 1);
+  });
+}
+
 void IntelCpuSensor::Reinitialize(
     absl::AnyInvocable<void(absl::Status)> callback) {
   boost::asio::post(*io_context_,
                     [this, callback = std::move(callback)]() mutable {
-                      callback(ReinitializeInternal());
+                      ReinitializeWithRetry(std::move(callback), 3);
                     });
 }
 
@@ -494,10 +520,22 @@
       sensor_type_(sensor_type),
       sensor_name_(sensor_name),
       sensor_label_(sensor_label),
-      dts_offset_(dts_offset) {}
+      dts_offset_(dts_offset),
+      retry_timer_(*io_context) {}
 
-IntelCpuSensor::IntelCpuSensor(const PeciSysfs& peci_sysfs)
-    : PeciHwmonBasedSensor("", nullptr, SensorAttributesStatic(),
-                           ThresholdConfigs(), std::nullopt, peci_sysfs) {}
+IntelCpuSensor::IntelCpuSensor(
+    const PeciSysfs& peci_sysfs, const IntelCpuSensorConfig& config,
+    const std::string& sensor_label,
+    const std::shared_ptr<boost::asio::io_context>& io_context,
+    const std::vector<uint32_t>& retry_delays)
+    : PeciHwmonBasedSensor("", io_context,
+                           CreateStaticAttributes(
+                               "", SensorUnit::UNIT_UNKNOWN,
+                               config.hal_common_config(), EntityCommonConfig(),
+                               ReadingRangeConfigs(), ReadingTransformConfig()),
+                           ThresholdConfigs(), std::nullopt, peci_sysfs),
+      retry_delays_(retry_delays),
+      sensor_label_(sensor_label),
+      retry_timer_(*io_context) {}
 
 }  // namespace milotic_tlbmc
diff --git a/tlbmc/sensors/intel_cpu_sensor.h b/tlbmc/sensors/intel_cpu_sensor.h
index b1816f8..45b3f20 100644
--- a/tlbmc/sensors/intel_cpu_sensor.h
+++ b/tlbmc/sensors/intel_cpu_sensor.h
@@ -116,7 +116,11 @@
   IntelCpuSensor() = default;
 
   // For unit testing only.
-  explicit IntelCpuSensor(const PeciSysfs& peci_sysfs);
+  explicit IntelCpuSensor(
+      const PeciSysfs& peci_sysfs, const IntelCpuSensorConfig& config,
+      const std::string& sensor_label,
+      const std::shared_ptr<boost::asio::io_context>& io_context,
+      const std::vector<uint32_t>& retry_delays);
 
   static absl::StatusOr<std::tuple<std::string, std::string, std::string>>
   SplitInputFile(const boost::filesystem::path& input_file_path);
@@ -136,10 +140,16 @@
 
   absl::Status ReinitializeInternal();
 
+  void ReinitializeWithRetry(absl::AnyInvocable<void(absl::Status)> callback,
+                             uint32_t attempts_left);
+
   const IntelCpuSensorType sensor_type_ = INTEL_CPU_SENSOR_TYPE0_UNKNOWN;
+  const std::vector<uint32_t> retry_delays_ = {5, 30, 60};
   const std::string sensor_name_;
   const std::string sensor_label_;
   const double dts_offset_ = 0.0;
+
+  boost::asio::steady_timer retry_timer_;
 };
 
 }  // namespace milotic_tlbmc