Add Power Fault Log Collector Module and Configs
This CL adds the PowerFaultLogCollectorModule to the tlbmc central config, including:
* New proto messages for the module, config, and log entries.
* Updated proto_config_parser to load the power fault log config.
* Added new files to meson.build and copy.bara.sky.
* Added the module (disabled by default) to tlbmc_config_bundle.textproto and platform JSON configs for tests.
#tlbmc_power_fault_log
PiperOrigin-RevId: 847532034
Change-Id: I4351449b11c3589bf9600ff8d9e633a2a0716ced
diff --git a/tlbmc/central_config.proto b/tlbmc/central_config.proto
index d49d2f1..1cd805d 100644
--- a/tlbmc/central_config.proto
+++ b/tlbmc/central_config.proto
@@ -122,6 +122,11 @@
PowerControlSubmodule power_control_sub_module = 2;
}
+// Controls the Power Fault Log collector module.
+message PowerFaultLogCollectorModule {
+ // Whether the Power Fault Log collector module is turned on. The field
+ // explicitly defaults to false.
+ bool enabled = 1 [default = false];
+}
+
// A proto message to hold all configurations of the modules in tlbmc.
message TlbmcConfig {
string platform_name = 8;
@@ -133,6 +138,7 @@
RedfishRateLimiterModule redfish_rate_limiter_module = 5;
TrustBundleInstallModule trust_bundle_install_module = 6;
GpioCollectorModule gpio_collector_module = 7;
+ PowerFaultLogCollectorModule power_fault_log_collector_module = 9;
}
message TlbmcConfigBundle {
diff --git a/tlbmc/collector/power_fault_log_collector.cc b/tlbmc/collector/power_fault_log_collector.cc
new file mode 100644
index 0000000..056470a
--- /dev/null
+++ b/tlbmc/collector/power_fault_log_collector.cc
@@ -0,0 +1,249 @@
+#include "tlbmc/collector/power_fault_log_collector.h"
+
+#include <algorithm>
+#include <filesystem> // NOLINT
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <system_error> // NOLINT
+#include <utility>
+#include <vector>
+
+#include "absl/functional/any_invocable.h"
+#include "absl/log/log.h"
+#include "absl/memory/memory.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "nlohmann/json.hpp"
+#include "power_fault_log_config.pb.h"
+#include "power_fault_log_entry.pb.h"
+#include "tlbmc/scheduler/scheduler.h"
+#include "google/protobuf/json/json.h"
+
+namespace milotic_tlbmc {
+
+namespace fs = std::filesystem;
+// Builds a collector from `params`. The thread manager (and the task
+// scheduler it owns) is created here from the supplied clock; nothing is
+// scheduled until StartCollection() is called.
+absl::StatusOr<std::unique_ptr<PowerFaultLogCollector>>
+PowerFaultLogCollector::Create(const Params& params) {
+  auto manager = std::make_unique<PowerFaultLogThreadManager>(params.clock);
+  // TODO(b/464412408): This feature will be implemented in a follow up CL.
+  // Until then the flag only selects which message is logged.
+  const char* const detection_message =
+      params.config.enable_detection()
+          ? "Power fault detection object creation is enabled."
+          : "Power fault detection object creation is disabled.";
+  LOG(INFO) << detection_message;
+  return absl::WrapUnique(
+      new PowerFaultLogCollector(params.config, std::move(manager)));
+}
+
+// Verifies that the configured log path exists and is a directory, then
+// registers DoCollection() with the task scheduler at the configured
+// interval.
+//
+// Returns:
+//   InternalError            - the filesystem query itself failed.
+//   NotFoundError            - the log path does not exist.
+//   FailedPreconditionError  - the log path is not a directory.
+//   OkStatus                 - the periodic task was registered.
+absl::Status PowerFaultLogCollector::StartCollection() {
+  const fs::path root(config_.power_fault_log_path());
+  std::error_code ec;
+
+  const bool root_exists = fs::exists(root, ec);
+  if (!root_exists && ec) {
+    return absl::InternalError(
+        absl::StrCat("Failed to check if log path exists: ", root.string()));
+  }
+  if (!root_exists) {
+    return absl::NotFoundError(
+        absl::StrCat("Log path does not exist: ", root.string()));
+  }
+
+  const bool root_is_dir = fs::is_directory(root, ec);
+  if (!root_is_dir && ec) {
+    return absl::InternalError(absl::StrCat(
+        "Failed to check if log path is a directory: ", root.string()));
+  }
+  if (!root_is_dir) {
+    return absl::FailedPreconditionError(
+        absl::StrCat("Log path is not a directory: ", root.string()));
+  }
+
+  // The scheduler hands each run a completion callback which DoCollection()
+  // invokes when the scan is finished.
+  auto periodic_scan = [this](absl::AnyInvocable<void()> on_done) {
+    DoCollection(std::move(on_done));
+  };
+  const absl::Duration interval =
+      absl::Seconds(config_.collection_interval_seconds());
+  thread_manager_->task_ids.push_back(
+      thread_manager_->task_scheduler->RunAndScheduleAsync(
+          std::move(periodic_scan), interval));
+  return absl::OkStatus();
+}
+
+PowerFaultLogCollector::PowerFaultLogCollector(
+ const PowerFaultLogConfig& config,
+ std::unique_ptr<PowerFaultLogThreadManager> thread_manager)
+ : config_(config), thread_manager_(std::move(thread_manager)) {}
+
+// Stops the task scheduler, if one was created. A collector built through the
+// protected default constructor has no thread manager, hence the null check.
+PowerFaultLogCollector::~PowerFaultLogCollector() {
+  if (thread_manager_ == nullptr) {
+    return;
+  }
+  thread_manager_->task_scheduler->Stop();
+}
+
+// Performs one scan of the configured log root. Every sub-directory whose
+// name is not yet in processed_dirs_ is handed to ProcessNewDirectory();
+// non-directory entries are skipped. Errors are logged (rate limited) and
+// never propagated. `on_done`, when non-null, is invoked exactly once after
+// the scan, whether or not the scan hit an error.
+void PowerFaultLogCollector::DoCollection(absl::AnyInvocable<void()> on_done) {
+  const fs::path log_path(config_.power_fault_log_path());
+  std::error_code ec;
+  // Advance with increment(ec) instead of a range-for: operator++ reports
+  // failures by throwing, which would bypass both the `ec` check after the
+  // loop and the on_done() notification. `!ec` is tested before the iterator
+  // is compared or dereferenced, so a failed construction or increment simply
+  // ends the loop.
+  for (fs::directory_iterator it(log_path, ec), end; !ec && it != end;
+       it.increment(ec)) {
+    const fs::directory_entry& entry = *it;
+    std::error_code entry_ec;
+    if (!entry.is_directory(entry_ec)) {
+      if (entry_ec) {
+        LOG_EVERY_N_SEC(ERROR, absl::ToInt64Seconds(absl::Minutes(5)))
+            << "Error checking if entry " << entry.path()
+            << " is a directory: " << entry_ec.message();
+      }
+      // Not a directory, skip.
+      continue;
+    }
+    // Directory found. Check if it's already processed. The lock is released
+    // before ProcessNewDirectory(), which takes it again itself.
+    {
+      absl::MutexLock lock(mutex_);
+      if (processed_dirs_.contains(entry.path().filename().string())) {
+        continue;  // Already processed, skip re-adding.
+      }
+    }
+    ProcessNewDirectory(entry.path());
+  }
+  if (ec) {  // Set by either the iterator construction or an increment.
+    LOG_EVERY_N_SEC(ERROR, absl::ToInt64Seconds(absl::Minutes(5)))
+        << "Error finishing directory iteration " << log_path << ": "
+        << ec.message();
+  }
+
+  if (on_done) {
+    on_done();
+  }
+}
+
+// Records one fault-event directory.
+//
+// - Directories whose name ends in ".tmp" are skipped and left unmarked, so
+//   a later scan looks at them again.
+// - If listing the directory fails, the directory is marked as processed and
+//   no entry is recorded.
+// - A directory without regular files is left unmarked, so a later scan
+//   looks at it again.
+// - Otherwise the directory is marked as processed and an entry holding its
+//   name and the names of its regular files is appended to
+//   collected_entries_.
+void PowerFaultLogCollector::ProcessNewDirectory(const fs::path& dir_path) {
+  const std::string folder_name = dir_path.filename().string();
+  LOG(INFO) << "Processing new directory: " << folder_name;
+  if (folder_name.ends_with(".tmp")) {
+    LOG(INFO) << "Skipping current directory: " << folder_name;
+    return;
+  }
+  std::vector<std::string> file_names;
+  std::error_code ec;
+  // Advance with increment(ec) instead of a range-for: operator++ reports
+  // failures by throwing, which would bypass the `ec` handling below.
+  for (fs::directory_iterator it(dir_path, ec), end; !ec && it != end;
+       it.increment(ec)) {
+    const fs::directory_entry& entry = *it;
+    std::error_code entry_ec;
+    if (!entry.is_regular_file(entry_ec)) {
+      if (entry_ec) {
+        LOG(ERROR) << "Error checking if entry " << entry.path()
+                   << " is a regular file: " << entry_ec.message();
+      }
+      // Not a regular file, skip.
+      continue;
+    }
+    file_names.push_back(entry.path().filename().string());
+  }
+  if (ec) {  // Set by either the iterator construction or an increment.
+    LOG(ERROR) << "Error finishing directory iteration " << dir_path << ": "
+               << ec.message();
+    // If there is an error when iterating the directory, we still add the
+    // directory to processed_dirs_ so that it is not re-scanned (and the
+    // error re-logged) on every collection.
+    absl::MutexLock lock(mutex_);
+    processed_dirs_.insert(folder_name);
+    return;
+  }
+  if (file_names.empty()) {
+    LOG(INFO) << "No files found in directory: " << folder_name;
+    return;
+  }
+  // Now we know the directory has files. Build the entry outside the lock.
+  PowerFaultLogEntry new_entry;
+  new_entry.set_folder_name(folder_name);
+  for (const auto& file_name : file_names) {
+    new_entry.add_file_names(file_name);
+  }
+  absl::MutexLock lock(mutex_);
+  // insert() doubles as the "already recorded?" check: the caller released
+  // the lock after its own lookup, so an overlapping scan may have recorded
+  // this folder in the meantime. Only append the entry if it is genuinely new.
+  if (!processed_dirs_.insert(folder_name).second) {
+    return;
+  }
+  *collected_entries_.add_entries() = std::move(new_entry);
+  LOG(INFO) << "Added entry for folder: " << folder_name;
+}
+
+// Returns a snapshot of the collected entries; the copy is taken while
+// holding mutex_.
+PowerFaultLogEntries PowerFaultLogCollector::GetCollectedEntries() const {
+  PowerFaultLogEntries snapshot;
+  {
+    absl::MutexLock lock(mutex_);
+    snapshot = collected_entries_;
+  }
+  return snapshot;
+}
+
+// Returns the full content of `file_name` inside `folder_name` under the
+// configured log root.
+//
+// Only folder/file pairs that were recorded by a previous collection are
+// served, so callers cannot use this to read arbitrary paths.
+//
+// Returns:
+//   NotFoundError - the pair is not in the collected entries, or the path no
+//                   longer refers to a regular file.
+//   InternalError - the file status could not be determined, or the file
+//                   could not be opened.
+absl::StatusOr<std::string> PowerFaultLogCollector::GetLogFileContent(
+    absl::string_view folder_name, absl::string_view file_name) const {
+  // Check if the file is in the collected entries to avoid unauthorized access.
+  {
+    absl::MutexLock lock(mutex_);
+    const auto& entries = collected_entries_.entries();
+    const bool is_collected =
+        std::any_of(entries.begin(), entries.end(), [&](const auto& entry) {
+          if (entry.folder_name() != folder_name) {
+            return false;
+          }
+          return std::any_of(entry.file_names().begin(),
+                             entry.file_names().end(),
+                             [&](const auto& f) { return f == file_name; });
+        });
+    if (!is_collected) {
+      return absl::NotFoundError(
+          absl::StrCat("File not found in collected entries: ", folder_name,
+                       "/", file_name));
+    }
+  }
+  const fs::path file_path =
+      fs::path(config_.power_fault_log_path()) / folder_name / file_name;
+  // Query the status once. A missing file yields a known status (not_found)
+  // and is reported as NotFound below; an unknown status means the query
+  // itself failed, which is reported as an internal error in the same way
+  // StartCollection() treats filesystem query failures.
+  std::error_code ec;
+  const fs::file_status file_status = fs::status(file_path, ec);
+  if (!fs::status_known(file_status)) {
+    return absl::InternalError(
+        absl::StrCat("Failed to query file status: ", file_path.string(), ": ",
+                     ec.message()));
+  }
+  if (!fs::is_regular_file(file_status)) {
+    return absl::NotFoundError(
+        absl::StrCat("File not found: ", file_path.string()));
+  }
+  std::ifstream file(file_path);
+  if (!file.is_open()) {
+    return absl::InternalError(
+        absl::StrCat("Failed to open file: ", file_path.string()));
+  }
+  std::stringstream buffer;
+  buffer << file.rdbuf();
+  return buffer.str();
+}
+
+// Serializes the collected entries to JSON via protobuf's JSON printer.
+// Returns a null JSON value if the proto cannot be serialized; if the
+// produced string cannot be parsed, the non-throwing parse result is
+// returned as-is. mutex_ is held for the duration of the call.
+nlohmann::json PowerFaultLogCollector::ToJson() const {
+  absl::MutexLock lock(mutex_);
+  std::string serialized;
+  const bool converted =
+      ::google::protobuf::json::MessageToJsonString(collected_entries_, &serialized)
+          .ok();
+  if (!converted) {
+    LOG(ERROR) << "Failed to convert PowerFaultLogEntries to JSON string.";
+    return nlohmann::json();
+  }
+  // allow_exceptions=false: a parse failure does not throw.
+  return nlohmann::json::parse(serialized, /*cb=*/nullptr,
+                               /*allow_exceptions=*/false);
+}
+
+// Returns the task scheduler's own JSON report.
+nlohmann::json PowerFaultLogCollector::GetSchedulerStats() const {
+  const auto& scheduler = *thread_manager_->task_scheduler;
+  return scheduler.ToJson();
+}
+
+// Creates the no-op collector. It is default-constructed, so it has no
+// config and no thread manager.
+std::unique_ptr<EmptyPowerFaultLogCollector>
+EmptyPowerFaultLogCollector::Create() {
+  auto collector = std::make_unique<EmptyPowerFaultLogCollector>();
+  return collector;
+}
+
+// Nothing to start for the empty collector; always reports success.
+absl::Status EmptyPowerFaultLogCollector::StartCollection() {
+  // A default-constructed absl::Status is OK.
+  return absl::Status();
+}
+
+// The empty collector never serves file content; both arguments are ignored
+// and an Unimplemented error is returned.
+absl::StatusOr<std::string> EmptyPowerFaultLogCollector::GetLogFileContent(
+    absl::string_view folder_name, absl::string_view file_name) const {
+  static constexpr absl::string_view kNotImplemented =
+      "GetLogFileContent is not implemented for EmptyPowerFaultLogCollector.";
+  return absl::UnimplementedError(kNotImplemented);
+}
+
+// Returns a JSON object with a single "Warning" member noting that the empty
+// collector is in use.
+nlohmann::json EmptyPowerFaultLogCollector::ToJson() const {
+  nlohmann::json warning = nlohmann::json::object();
+  warning["Warning"] = "EmptyPowerFaultLogCollector used.";
+  return warning;
+}
+
+// There is no scheduler behind the empty collector; returns a JSON object
+// with a single "Warning" member instead of scheduler statistics.
+nlohmann::json EmptyPowerFaultLogCollector::GetSchedulerStats() const {
+  nlohmann::json warning = nlohmann::json::object();
+  warning["Warning"] = "EmptyPowerFaultLogCollector used.";
+  return warning;
+}
+
+} // namespace milotic_tlbmc
diff --git a/tlbmc/collector/power_fault_log_collector.h b/tlbmc/collector/power_fault_log_collector.h
new file mode 100644
index 0000000..d9f64df
--- /dev/null
+++ b/tlbmc/collector/power_fault_log_collector.h
@@ -0,0 +1,89 @@
+#ifndef THIRD_PARTY_MILOTIC_EXTERNAL_CC_TLBMC_COLLECTOR_POWER_FAULT_LOG_COLLECTOR_H_
+#define THIRD_PARTY_MILOTIC_EXTERNAL_CC_TLBMC_COLLECTOR_POWER_FAULT_LOG_COLLECTOR_H_
+#include <filesystem> // NOLINT
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "time/clock.h"
+#include "nlohmann/json.hpp"
+#include "tlbmc/collector/collector.h"
+#include "power_fault_log_config.pb.h"
+#include "power_fault_log_entry.pb.h"
+#include "tlbmc/scheduler/scheduler.h"
+
+namespace milotic_tlbmc {
+
+// Bundles the task scheduler used by the power fault log collector with the
+// ids of the tasks that were registered on it.
+struct PowerFaultLogThreadManager {
+ // Creates the scheduler with the given clock. `clock` is passed through to
+ // TaskScheduler as a raw pointer and is not owned by this struct.
+ explicit PowerFaultLogThreadManager(ecclesia::Clock* clock)
+ : task_scheduler(std::make_unique<TaskScheduler>(clock)) {}
+ // Scheduler that runs the periodic collection task.
+ std::unique_ptr<TaskScheduler> task_scheduler;
+ // Values returned by the scheduler when tasks were registered.
+ std::vector<int> task_ids;
+};
+
+// This class is used to collect power fault logs from a directory. It
+// periodically scans the configured log path for new sub-directories and
+// records, for each one, its name and the names of the regular files inside
+// it in a PowerFaultLogEntries proto. File contents are not parsed; they are
+// read on demand through GetLogFileContent(). It expects that a folder
+// represents a single fault event and the folder contains multiple log files.
+class PowerFaultLogCollector : public Collector {
+ public:
+ // Construction parameters for Create().
+ struct Params {
+ // Clock handed to the task scheduler; defaults to the real clock.
+ ecclesia::Clock* clock = ecclesia::Clock::RealClock();
+ // Collector configuration (log path, collection interval, detection flag).
+ PowerFaultLogConfig config;
+ };
+ // Creates a collector from `params`. Collection does not begin until
+ // StartCollection() is called.
+ static absl::StatusOr<std::unique_ptr<PowerFaultLogCollector>> Create(
+ const Params& params);
+ // Stops the task scheduler if one was created.
+ ~PowerFaultLogCollector() override;
+ // Starts monitoring the log path. Fails if the configured path does not
+ // exist or is not a directory; otherwise schedules DoCollection() at the
+ // configured interval.
+ virtual absl::Status StartCollection();
+ // Performs a single collection of the log path. `on_done`, if non-null, is
+ // invoked once the scan has finished.
+ virtual void DoCollection(absl::AnyInvocable<void()> on_done);
+ // Returns a copy of the collected entries.
+ virtual PowerFaultLogEntries GetCollectedEntries() const;
+ // Returns the content of the log file `file_name` in `folder_name`. Only
+ // folder/file pairs present in the collected entries are served.
+ virtual absl::StatusOr<std::string> GetLogFileContent(
+ absl::string_view folder_name, absl::string_view file_name) const;
+ // Returns the collected entries as JSON.
+ nlohmann::json ToJson() const override;
+ // Returns the task scheduler's JSON report.
+ nlohmann::json GetSchedulerStats() const override;
+
+ protected:
+ // For subclasses only. Leaves the config empty and the thread manager null.
+ PowerFaultLogCollector() = default;
+
+ private:
+ // Used by Create(): copies `config` and takes ownership of `thread_manager`.
+ PowerFaultLogCollector(
+ const PowerFaultLogConfig& config,
+ std::unique_ptr<PowerFaultLogThreadManager> thread_manager);
+ // Processes a new directory found in the log path.
+ void ProcessNewDirectory(const std::filesystem::path& dir_path);
+ // Configuration the collector was created with.
+ PowerFaultLogConfig config_;
+ // Owns the task scheduler; null when default-constructed.
+ std::unique_ptr<PowerFaultLogThreadManager> thread_manager_;
+ // Guards collected_entries_ and processed_dirs_.
+ mutable absl::Mutex mutex_;
+ // One entry per recorded directory that contained at least one regular file.
+ PowerFaultLogEntries collected_entries_ ABSL_GUARDED_BY(mutex_);
+ // Names of directories that will not be processed again.
+ absl::flat_hash_set<std::string> processed_dirs_ ABSL_GUARDED_BY(mutex_);
+};
+
+// A PowerFaultLogCollector that collects nothing. It is default-constructed
+// (no config, no scheduler) and overrides the entry points below with
+// trivial implementations.
+class EmptyPowerFaultLogCollector final : public PowerFaultLogCollector {
+ public:
+ // Creates the empty collector.
+ static std::unique_ptr<EmptyPowerFaultLogCollector> Create();
+
+ // Always returns OK without scheduling anything.
+ absl::Status StartCollection() override;
+
+ // Always returns an Unimplemented error; the arguments are ignored.
+ absl::StatusOr<std::string> GetLogFileContent(
+ absl::string_view folder_name,
+ absl::string_view file_name) const override;
+
+ // Returns a JSON object containing only a "Warning" member.
+ nlohmann::json ToJson() const override;
+
+ // Returns a JSON object containing only a "Warning" member.
+ nlohmann::json GetSchedulerStats() const override;
+};
+
+} // namespace milotic_tlbmc
+
+#endif // THIRD_PARTY_MILOTIC_EXTERNAL_CC_TLBMC_COLLECTOR_POWER_FAULT_LOG_COLLECTOR_H_
diff --git a/tlbmc/configs/proto_config_parser.cc b/tlbmc/configs/proto_config_parser.cc
index 98b283f..d5b44df 100644
--- a/tlbmc/configs/proto_config_parser.cc
+++ b/tlbmc/configs/proto_config_parser.cc
@@ -49,6 +49,8 @@
absl::StrCat(config_path_, "power_control_configs.textproto"));
data_store_.thermal_configs = GetConfigFromProto<ThermalConfigs>(
absl::StrCat(config_path_, "thermal_config.textproto"));
+ data_store_.power_fault_log_config = GetConfigFromProto<PowerFaultLogConfig>(
+ absl::StrCat(config_path_, "power_fault_log_config.textproto"));
}
SoftwareMetricsConfig ProtoConfigParser::GetSoftwareMetricsConfig() const {
@@ -67,6 +69,10 @@
return data_store_.thermal_configs;
}
+// Returns a copy of the power fault log config held in the data store.
+PowerFaultLogConfig ProtoConfigParser::GetPowerFaultLogConfig() const {
+  const PowerFaultLogConfig& stored = data_store_.power_fault_log_config;
+  return stored;
+}
+
std::unique_ptr<ProtoConfigParser> ProtoConfigParser::Create(
absl::string_view config_path) {
return std::make_unique<ProtoConfigParser>(config_path);
diff --git a/tlbmc/configs/proto_config_parser.h b/tlbmc/configs/proto_config_parser.h
index a0eda6d..6597bf5 100644
--- a/tlbmc/configs/proto_config_parser.h
+++ b/tlbmc/configs/proto_config_parser.h
@@ -9,6 +9,7 @@
#include "absl/strings/string_view.h"
#include "gpio_config.pb.h"
#include "power_control.pb.h"
+#include "power_fault_log_config.pb.h"
#include "software_metrics_config.pb.h"
#include "thermal_config.pb.h"
@@ -19,6 +20,7 @@
GpioConfigs gpio_configs;
PowerControlConfigs power_control_configs;
ThermalConfigs thermal_configs;
+ PowerFaultLogConfig power_fault_log_config;
};
class ProtoConfigParser {
@@ -31,6 +33,7 @@
GpioConfigs GetGpioConfigs() const;
PowerControlConfigs GetPowerControlConfigs() const;
ThermalConfigs GetThermalConfigs() const;
+ PowerFaultLogConfig GetPowerFaultLogConfig() const;
explicit ProtoConfigParser(absl::string_view config_path)
: config_path_(config_path) {}
diff --git a/tlbmc/meson.build b/tlbmc/meson.build
index 830de41..2881549 100644
--- a/tlbmc/meson.build
+++ b/tlbmc/meson.build
@@ -30,6 +30,8 @@
'nic_telemetry_config.proto',
'payload.proto',
'power_control.proto',
+ 'power_fault_log_config.proto',
+ 'power_fault_log_entry.proto',
'psu_sensor_config.proto',
'reading_range_config.proto',
'reading_transform_config.proto',
@@ -154,6 +156,7 @@
'collector/gpio_collector.cc',
'collector/metric_collector.cc',
'collector/peci_scanner.cc',
+ 'collector/power_fault_log_collector.cc',
'collector/sensor_collector.cc',
'collector/thermal_collector.cc',
'configs/blocklist_parser.cc',
diff --git a/tlbmc/power_fault_log_config.proto b/tlbmc/power_fault_log_config.proto
new file mode 100644
index 0000000..a818629
--- /dev/null
+++ b/tlbmc/power_fault_log_config.proto
@@ -0,0 +1,12 @@
+edition = "2023";
+
+package milotic_tlbmc;
+
+// Configuration for the power fault log collector.
+message PowerFaultLogConfig {
+ // The interval in seconds between each power fault log collection.
+ int32 collection_interval_seconds = 1;
+ // Root path to the power fault log directory. The collector requires this
+ // to be an existing directory and records each sub-directory that contains
+ // regular files as one entry.
+ string power_fault_log_path = 2;
+ // Whether to enable the power fault log detection. The collector currently
+ // only logs this value; detection itself is not implemented yet.
+ bool enable_detection = 3;
+}
diff --git a/tlbmc/power_fault_log_entry.proto b/tlbmc/power_fault_log_entry.proto
new file mode 100644
index 0000000..448aa7b
--- /dev/null
+++ b/tlbmc/power_fault_log_entry.proto
@@ -0,0 +1,12 @@
+edition = "2023";
+
+package milotic_tlbmc;
+
+// One recorded power fault log folder.
+message PowerFaultLogEntry {
+ // Name of the folder (last path component only) under the log root.
+ string folder_name = 1;
+ // Names (not full paths) of the regular files found directly in the folder.
+ repeated string file_names = 2;
+}
+
+// All power fault log folders recorded by the collector so far.
+message PowerFaultLogEntries {
+ // One entry per recorded folder, in the order the folders were recorded.
+ repeated PowerFaultLogEntry entries = 1;
+}