#include "tlbmc/pacemaker/pacemaker.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <memory>
#include <string>
#include <utility>

#include "absl/functional/any_invocable.h"
#include "absl/log/log.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "g3/macros.h"
#include "nlohmann/json.hpp"
| |
| namespace milotic_tlbmc { |
| |
namespace {

// Number of back-to-back restart attempts after which RestartService refuses
// to issue another restart.
constexpr int kMaxConsecutiveRestartAttempts = 10;

// Name handed to every systemctl/top command issued by the checks below.
constexpr const char* kProcessName = "bmcweb";

// Port that the monitored process is expected to be listening on.
constexpr int kExpectedListeningPort = 443;

// Memory usage above this value triggers a restart (300MB, in bytes).
constexpr int kMemoryUsageThreshold = 300 * 1024 * 1024;

// `%s` is the service name. The trimmed output is compared against
// kActiveStatus.
constexpr const char* kIsActiveCommand = "systemctl is-active %s";
constexpr const char* kActiveStatus = "active";

// `%s` is the service name.
constexpr const char* kRestartCommand = "systemctl restart %s";

// `%d` is the port number. Any non-empty output is treated as "listening";
// the grep pattern is the bare number, so it matches that digit sequence
// anywhere on a line of `ss` output.
constexpr const char* kIsListeningCommand = "ss -tulpn | grep %d";

// `%d` is the pid. Prints the numeric VmRSS field; proc(5) documents that
// field in kB, so the consumer has to account for the unit.
constexpr const char* kGetMemoryUsageCommand =
    "cat /proc/%d/status | grep VmRSS | awk '{print $2}'";

// `%s` is the service name. Prints the third field of the "Main PID:" line.
constexpr const char* kGetPidCommand =
    "systemctl status %s | grep 'Main PID:' | awk '{print $3}'";

// `%s` is the process name. Prints the integer part of column 8 of every
// matching `top` line.
// NOTE(review): column 8 is presumably %CPU for the target's `top` flavour --
// confirm on the device.
constexpr const char* kGetCpuUsageCommand =
    "top -n1 | grep %s -i | awk '{print int($8)}'";

// `%s` is the service name. Prints the value of the
// ActiveEnterTimestampMonotonic property.
constexpr const char* kGetActiveEnterTimestampMonotonic =
    "systemctl show --property=ActiveEnterTimestampMonotonic %s | awk -F'=' "
    "'{print $2}'";

// How many of the newest samples / errors MonitoredData::ToJson emits.
constexpr int kMaxRecentDataPoints = 12;

}  // namespace
| |
| nlohmann::json Pacemaker::ErrorInfo::ToJson() const { |
| auto error_type_to_string = [](ErrorType type) -> std::string { |
| switch (type) { |
| case ErrorType::kUnknown: |
| return "Unknown"; |
| case ErrorType::kServiceInactive: |
| return "ServiceInactive"; |
| case ErrorType::kPortNotListening: |
| return "PortNotListening"; |
| case ErrorType::kMemoryUsageAboveThreshold: |
| return "MemoryUsageAboveThreshold"; |
| default: |
| return "Unknown"; |
| } |
| }; |
| |
| nlohmann::json json; |
| json["type"] = error_type_to_string(type); |
| json["timestamp"] = absl::FormatTime(timestamp); |
| return json; |
| } |
| |
| nlohmann::json Pacemaker::MonitoredData::ToJson() const { |
| nlohmann::json json; |
| // Get the last kMaxRecentDataPoints memory usage data points. |
| auto it = memory_usage_bytes.rbegin(); |
| for (int i = 0; i < kMaxRecentDataPoints && it != memory_usage_bytes.rend(); |
| ++i, ++it) { |
| json["MemoryUsageRecentToOldest"].push_back(*it); |
| } |
| // Get the last kMaxRecentDataPoints error data points. |
| auto it_error = restart_log.rbegin(); |
| for (int i = 0; i < kMaxRecentDataPoints && it_error != restart_log.rend(); |
| ++i, ++it_error) { |
| json["ErrorsRecentToOldest"].push_back(it_error->ToJson()); |
| } |
| json["CpuUsage"] = cpu_usage; |
| json["MemoryUsage"] = |
| memory_usage_bytes.empty() ? -1 : memory_usage_bytes.back(); |
| json["LastActiveTimestamp"] = last_active_timestamp; |
| json["LastResetTime"] = absl::FormatTime(last_reset_time); |
| json["RestartTriggered"] = restart_triggered; |
| json["Pid"] = pid; |
| json["ConsecutiveRestartAttempts"] = consecutive_restart_attempts; |
| return json; |
| } |
| |
| absl::StatusOr<std::string> ShellCommandExecutor::Execute( |
| const std::string& command) { |
| FILE* pipe = popen(command.c_str(), "r"); |
| if (pipe == nullptr) { |
| return absl::InternalError( |
| absl::StrCat("Error: Failed to execute command: ", command)); |
| } |
| std::string result; |
| char* line = nullptr; |
| size_t len = 0; |
| ssize_t read; |
| |
| while ((read = getline(&line, &len, pipe)) != -1) { |
| result.append(line, static_cast<std::string::size_type>(read)); |
| } |
| |
| free(line); |
| pclose(pipe); |
| return result; |
| } |
| |
| absl::StatusOr<bool> Pacemaker::IsServiceActive( |
| const std::string& service_name) const { |
| LOG(INFO) << "Checking if service " << service_name << " is active."; |
| ECCLESIA_ASSIGN_OR_RETURN( |
| std::string output, shell_command_executor_->Execute( |
| absl::StrFormat(kIsActiveCommand, service_name))); |
| absl::StripAsciiWhitespace(&output); |
| // Compare string |
| if (!absl::EqualsIgnoreCase(output, "active")) { |
| LOG(ERROR) << "Service " << service_name << " is not active." |
| << " status is " << output << " instead of " << kActiveStatus; |
| return false; |
| } |
| return true; |
| } |
| |
| absl::StatusOr<bool> Pacemaker::IsPortListening(int port) const { |
| LOG(INFO) << "Checking if port " << port << " is listening."; |
| ECCLESIA_ASSIGN_OR_RETURN(std::string output, |
| shell_command_executor_->Execute( |
| absl::StrFormat(kIsListeningCommand, port))); |
| return !output.empty(); |
| } |
| |
| absl::StatusOr<int> Pacemaker::GetMemoryUsage(int pid) const { |
| LOG(INFO) << "Getting memory usage for pid " << pid; |
| ECCLESIA_ASSIGN_OR_RETURN(std::string output, |
| shell_command_executor_->Execute( |
| absl::StrFormat(kGetMemoryUsageCommand, pid))); |
| int memory_usage; |
| if (!absl::SimpleAtoi(output, &memory_usage)) { |
| return absl::InternalError( |
| absl::StrCat("Error: Failed to parse memory usage: ", output)); |
| } |
| return memory_usage; |
| } |
| |
| absl::StatusOr<int> Pacemaker::GetCpuUsage( |
| const std::string& process_name) const { |
| LOG(INFO) << "Getting CPU usage for process " << process_name; |
| ECCLESIA_ASSIGN_OR_RETURN(std::string output, |
| shell_command_executor_->Execute(absl::StrFormat( |
| kGetCpuUsageCommand, process_name))); |
| int cpu_usage = -1; |
| if (output.empty()) { |
| return cpu_usage; |
| } |
| if (!absl::SimpleAtoi(output, &cpu_usage)) { |
| return absl::InternalError( |
| absl::StrCat("Error: Failed to parse CPU usage: ", output)); |
| } |
| return cpu_usage; |
| } |
| |
| absl::StatusOr<int64_t> Pacemaker::GetLastActiveTimestamp( |
| const std::string& process_name) const { |
| LOG(INFO) << "Getting last active timestamp for process " << process_name; |
| ECCLESIA_ASSIGN_OR_RETURN( |
| std::string output, |
| shell_command_executor_->Execute( |
| absl::StrFormat(kGetActiveEnterTimestampMonotonic, process_name))); |
| int64_t last_active_timestamp; |
| if (!absl::SimpleAtoi(output, &last_active_timestamp)) { |
| return absl::InternalError( |
| absl::StrCat("Error: Failed to parse last reset time: ", output)); |
| } |
| return last_active_timestamp; |
| } |
| |
| absl::Status Pacemaker::RestartService(const std::string& service_name) { |
| LOG(ERROR) << "Restarting service " << service_name; |
| { |
| absl::MutexLock lock(&mutex_); |
| if (monitored_data_.restart_triggered) { |
| if (monitored_data_.consecutive_restart_attempts >= |
| kMaxConsecutiveRestartAttempts) { |
| LOG(ERROR) << "Restart attempts exceeded the limit of " |
| << kMaxConsecutiveRestartAttempts; |
| return absl::InternalError( |
| absl::StrCat("Error: Restart attempts exceeded the limit of ", |
| kMaxConsecutiveRestartAttempts)); |
| } |
| ++monitored_data_.consecutive_restart_attempts; |
| } |
| monitored_data_.restart_triggered = true; |
| monitored_data_.last_reset_time = absl::Now(); |
| } |
| ECCLESIA_ASSIGN_OR_RETURN( |
| std::string output, shell_command_executor_->Execute( |
| absl::StrFormat(kRestartCommand, service_name))); |
| return absl::OkStatus(); |
| } |
| |
| // Get PID of the process. |
| absl::StatusOr<int> Pacemaker::GetPid(const std::string& process_name) const { |
| LOG(INFO) << "Getting PID for process " << process_name; |
| ECCLESIA_ASSIGN_OR_RETURN(std::string output, |
| shell_command_executor_->Execute( |
| absl::StrFormat(kGetPidCommand, process_name))); |
| int pid = -1; |
| if (output.empty()) { |
| return absl::InternalError( |
| absl::StrCat("Error: Main PID not found for service: ", process_name)); |
| } |
| // Trim leading and trailing whitespace, including newlines. |
| output = std::string(absl::StripAsciiWhitespace(output)); |
| |
| if (absl::SimpleAtoi(output, &pid)) { |
| return pid; |
| } |
| return absl::InternalError( |
| absl::StrCat("Error: Invalid PID found in systemctl output: ", output, |
| " for service: ", process_name)); |
| } |
| |
| void Pacemaker::RecordError(ErrorType type) { |
| absl::MutexLock lock(&mutex_); |
| // If the error type is unknown, we can't determine the timestamp. |
| // So we just record the error and continue. |
| if (type == ErrorType::kUnknown) { |
| monitored_data_.restart_log.push_back({type, absl::InfinitePast()}); |
| return; |
| } |
| ErrorInfo error_info = {type, absl::Now()}; |
| LOG(ERROR) << "Restart required due to Error:\n" |
| << error_info.ToJson().dump(2); |
| monitored_data_.restart_log.push_back(error_info); |
| } |
| |
| absl::Status Pacemaker::PerformChecks() { |
| bool unknown_restart_detected = false; |
| ECCLESIA_ASSIGN_OR_RETURN(bool is_active, IsServiceActive(kProcessName)); |
| ECCLESIA_ASSIGN_OR_RETURN(bool is_listening, |
| IsPortListening(kExpectedListeningPort)); |
| ECCLESIA_ASSIGN_OR_RETURN(int pid, GetPid(kProcessName)); |
| ECCLESIA_ASSIGN_OR_RETURN(int cpu_usage, GetCpuUsage(kProcessName)); |
| int memory_usage = -1; |
| if (pid > 0) { |
| ECCLESIA_ASSIGN_OR_RETURN(memory_usage, GetMemoryUsage(pid)); |
| } |
| |
| absl::StatusOr<int64_t> last_active_timestamp = |
| GetLastActiveTimestamp(kProcessName); |
| if (!last_active_timestamp.ok()) { |
| LOG(ERROR) << "Failed to get last active timestamp: " |
| << last_active_timestamp.status(); |
| } |
| |
| // Update monitored data. |
| { |
| absl::MutexLock lock(&mutex_); |
| monitored_data_.memory_usage_bytes.push_back(memory_usage); |
| monitored_data_.cpu_usage = cpu_usage; |
| monitored_data_.pid = pid; |
| |
| // Check if the process was restarted outside of pacemaker. |
| if (last_active_timestamp.ok()) { |
| // We check if the `last_active_timestamp` we got from systemctl |
| // is different from the one we got from the previous run and if the |
| // process was not restarted by pacemaker. |
| // If the timestamps are different, it means the process was restarted |
| // outside of pacemaker and we should record it as an unknown error. |
| if (monitored_data_.last_active_timestamp != -1 && |
| *last_active_timestamp != monitored_data_.last_active_timestamp && |
| !monitored_data_.restart_triggered) { |
| LOG(INFO) << "Process " << kProcessName |
| << " was restarted outside of pacemaker."; |
| unknown_restart_detected = true; |
| } |
| monitored_data_.last_active_timestamp = *last_active_timestamp; |
| } |
| } |
| |
| if (unknown_restart_detected) { |
| RecordError(ErrorType::kUnknown); |
| } |
| |
| if (!is_active) { |
| RecordError(ErrorType::kServiceInactive); |
| ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName)); |
| return absl::OkStatus(); |
| } |
| |
| if (!is_listening) { |
| LOG(ERROR) << "Process " << kProcessName << " is not listening on port " |
| << kExpectedListeningPort; |
| RecordError(ErrorType::kPortNotListening); |
| ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName)); |
| return absl::OkStatus(); |
| } |
| |
| if (memory_usage > kMemoryUsageThreshold) { |
| LOG(ERROR) << "Process " << kProcessName << " memory usage is " |
| << memory_usage << " bytes which is more than the threshold of " |
| << kMemoryUsageThreshold; |
| RecordError(ErrorType::kMemoryUsageAboveThreshold); |
| ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName)); |
| return absl::OkStatus(); |
| } |
| |
| // If we reach here, it means all checks passed. |
| // Reset the restart triggered and consecutive restart attempts counters. |
| absl::MutexLock lock(&mutex_); |
| monitored_data_.restart_triggered = false; |
| monitored_data_.consecutive_restart_attempts = 0; |
| return absl::OkStatus(); |
| } |
| |
| nlohmann::json Pacemaker::GetMonitoringData() const { |
| nlohmann::json json; |
| { |
| absl::MutexLock lock(&mutex_); |
| json = monitored_data_.ToJson(); |
| } |
| json["PacemakerSchedulerStats"] = scheduler_.ToJson(); |
| return json; |
| } |
| |
| Pacemaker::Pacemaker( |
| absl::Duration interval, |
| std::unique_ptr<ShellCommandExecutor> shell_command_executor) |
| : shell_command_executor_(std::move(shell_command_executor)) { |
| scheduler_.RunAndScheduleAsync( |
| [this](absl::AnyInvocable<void()> OnDone) { |
| if (absl::Status status = PerformChecks(); !status.ok()) { |
| LOG(ERROR) << "Failed to perform checks: " << status; |
| } else { |
| // Dump the monitoring data to the syslog periodically. |
| LOG(WARNING) << "tlBMC Health stats:\n" |
| << GetMonitoringData().dump(2); |
| } |
| OnDone(); |
| }, |
| interval); |
| } |
| |
| } // namespace milotic_tlbmc |