blob: 4bbbe4aed24ad51998332228108cb6aed3fbe6d6 [file] [log] [blame]
#include "tlbmc/pacemaker/pacemaker.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <memory>
#include <string>
#include <utility>

#include "absl/functional/any_invocable.h"
#include "absl/log/log.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "g3/macros.h"
#include "nlohmann/json.hpp"
namespace milotic_tlbmc {
namespace {
constexpr const int kMaxConsecutiveRestartAttempts = 10;
constexpr const char* kProcessName = "bmcweb";
constexpr const int kExpectedListeningPort = 443;
constexpr const int kMemoryUsageThreshold = 300 * 1024 * 1024; // 300MB
constexpr const char* kIsActiveCommand = "systemctl is-active %s";
constexpr const char* kActiveStatus = "active";
constexpr const char* kRestartCommand = "systemctl restart %s";
constexpr const char* kIsListeningCommand = "ss -tulpn | grep %d";
constexpr const char* kGetMemoryUsageCommand =
"cat /proc/%d/status | grep VmRSS | awk '{print $2}'";
constexpr const char* kGetPidCommand =
"systemctl status %s | grep 'Main PID:' | awk '{print $3}'";
constexpr const char* kGetCpuUsageCommand =
"top -n1 | grep %s -i | awk '{print int($8)}'";
constexpr const char* kGetActiveEnterTimestampMonotonic =
"systemctl show --property=ActiveEnterTimestampMonotonic %s | awk -F'=' "
"'{print $2}'";
constexpr const int kMaxRecentDataPoints = 12;
} // namespace
nlohmann::json Pacemaker::ErrorInfo::ToJson() const {
auto error_type_to_string = [](ErrorType type) -> std::string {
switch (type) {
case ErrorType::kUnknown:
return "Unknown";
case ErrorType::kServiceInactive:
return "ServiceInactive";
case ErrorType::kPortNotListening:
return "PortNotListening";
case ErrorType::kMemoryUsageAboveThreshold:
return "MemoryUsageAboveThreshold";
default:
return "Unknown";
}
};
nlohmann::json json;
json["type"] = error_type_to_string(type);
json["timestamp"] = absl::FormatTime(timestamp);
return json;
}
nlohmann::json Pacemaker::MonitoredData::ToJson() const {
nlohmann::json json;
// Get the last kMaxRecentDataPoints memory usage data points.
auto it = memory_usage_bytes.rbegin();
for (int i = 0; i < kMaxRecentDataPoints && it != memory_usage_bytes.rend();
++i, ++it) {
json["MemoryUsageRecentToOldest"].push_back(*it);
}
// Get the last kMaxRecentDataPoints error data points.
auto it_error = restart_log.rbegin();
for (int i = 0; i < kMaxRecentDataPoints && it_error != restart_log.rend();
++i, ++it_error) {
json["ErrorsRecentToOldest"].push_back(it_error->ToJson());
}
json["CpuUsage"] = cpu_usage;
json["MemoryUsage"] =
memory_usage_bytes.empty() ? -1 : memory_usage_bytes.back();
json["LastActiveTimestamp"] = last_active_timestamp;
json["LastResetTime"] = absl::FormatTime(last_reset_time);
json["RestartTriggered"] = restart_triggered;
json["Pid"] = pid;
json["ConsecutiveRestartAttempts"] = consecutive_restart_attempts;
return json;
}
absl::StatusOr<std::string> ShellCommandExecutor::Execute(
const std::string& command) {
FILE* pipe = popen(command.c_str(), "r");
if (pipe == nullptr) {
return absl::InternalError(
absl::StrCat("Error: Failed to execute command: ", command));
}
std::string result;
char* line = nullptr;
size_t len = 0;
ssize_t read;
while ((read = getline(&line, &len, pipe)) != -1) {
result.append(line, static_cast<std::string::size_type>(read));
}
free(line);
pclose(pipe);
return result;
}
absl::StatusOr<bool> Pacemaker::IsServiceActive(
const std::string& service_name) const {
LOG(INFO) << "Checking if service " << service_name << " is active.";
ECCLESIA_ASSIGN_OR_RETURN(
std::string output, shell_command_executor_->Execute(
absl::StrFormat(kIsActiveCommand, service_name)));
absl::StripAsciiWhitespace(&output);
// Compare string
if (!absl::EqualsIgnoreCase(output, "active")) {
LOG(ERROR) << "Service " << service_name << " is not active."
<< " status is " << output << " instead of " << kActiveStatus;
return false;
}
return true;
}
absl::StatusOr<bool> Pacemaker::IsPortListening(int port) const {
LOG(INFO) << "Checking if port " << port << " is listening.";
ECCLESIA_ASSIGN_OR_RETURN(std::string output,
shell_command_executor_->Execute(
absl::StrFormat(kIsListeningCommand, port)));
return !output.empty();
}
absl::StatusOr<int> Pacemaker::GetMemoryUsage(int pid) const {
LOG(INFO) << "Getting memory usage for pid " << pid;
ECCLESIA_ASSIGN_OR_RETURN(std::string output,
shell_command_executor_->Execute(
absl::StrFormat(kGetMemoryUsageCommand, pid)));
int memory_usage;
if (!absl::SimpleAtoi(output, &memory_usage)) {
return absl::InternalError(
absl::StrCat("Error: Failed to parse memory usage: ", output));
}
return memory_usage;
}
absl::StatusOr<int> Pacemaker::GetCpuUsage(
const std::string& process_name) const {
LOG(INFO) << "Getting CPU usage for process " << process_name;
ECCLESIA_ASSIGN_OR_RETURN(std::string output,
shell_command_executor_->Execute(absl::StrFormat(
kGetCpuUsageCommand, process_name)));
int cpu_usage = -1;
if (output.empty()) {
return cpu_usage;
}
if (!absl::SimpleAtoi(output, &cpu_usage)) {
return absl::InternalError(
absl::StrCat("Error: Failed to parse CPU usage: ", output));
}
return cpu_usage;
}
absl::StatusOr<int64_t> Pacemaker::GetLastActiveTimestamp(
const std::string& process_name) const {
LOG(INFO) << "Getting last active timestamp for process " << process_name;
ECCLESIA_ASSIGN_OR_RETURN(
std::string output,
shell_command_executor_->Execute(
absl::StrFormat(kGetActiveEnterTimestampMonotonic, process_name)));
int64_t last_active_timestamp;
if (!absl::SimpleAtoi(output, &last_active_timestamp)) {
return absl::InternalError(
absl::StrCat("Error: Failed to parse last reset time: ", output));
}
return last_active_timestamp;
}
absl::Status Pacemaker::RestartService(const std::string& service_name) {
LOG(ERROR) << "Restarting service " << service_name;
{
absl::MutexLock lock(&mutex_);
if (monitored_data_.restart_triggered) {
if (monitored_data_.consecutive_restart_attempts >=
kMaxConsecutiveRestartAttempts) {
LOG(ERROR) << "Restart attempts exceeded the limit of "
<< kMaxConsecutiveRestartAttempts;
return absl::InternalError(
absl::StrCat("Error: Restart attempts exceeded the limit of ",
kMaxConsecutiveRestartAttempts));
}
++monitored_data_.consecutive_restart_attempts;
}
monitored_data_.restart_triggered = true;
monitored_data_.last_reset_time = absl::Now();
}
ECCLESIA_ASSIGN_OR_RETURN(
std::string output, shell_command_executor_->Execute(
absl::StrFormat(kRestartCommand, service_name)));
return absl::OkStatus();
}
// Get PID of the process.
absl::StatusOr<int> Pacemaker::GetPid(const std::string& process_name) const {
LOG(INFO) << "Getting PID for process " << process_name;
ECCLESIA_ASSIGN_OR_RETURN(std::string output,
shell_command_executor_->Execute(
absl::StrFormat(kGetPidCommand, process_name)));
int pid = -1;
if (output.empty()) {
return absl::InternalError(
absl::StrCat("Error: Main PID not found for service: ", process_name));
}
// Trim leading and trailing whitespace, including newlines.
output = std::string(absl::StripAsciiWhitespace(output));
if (absl::SimpleAtoi(output, &pid)) {
return pid;
}
return absl::InternalError(
absl::StrCat("Error: Invalid PID found in systemctl output: ", output,
" for service: ", process_name));
}
void Pacemaker::RecordError(ErrorType type) {
absl::MutexLock lock(&mutex_);
// If the error type is unknown, we can't determine the timestamp.
// So we just record the error and continue.
if (type == ErrorType::kUnknown) {
monitored_data_.restart_log.push_back({type, absl::InfinitePast()});
return;
}
ErrorInfo error_info = {type, absl::Now()};
LOG(ERROR) << "Restart required due to Error:\n"
<< error_info.ToJson().dump(2);
monitored_data_.restart_log.push_back(error_info);
}
absl::Status Pacemaker::PerformChecks() {
bool unknown_restart_detected = false;
ECCLESIA_ASSIGN_OR_RETURN(bool is_active, IsServiceActive(kProcessName));
ECCLESIA_ASSIGN_OR_RETURN(bool is_listening,
IsPortListening(kExpectedListeningPort));
ECCLESIA_ASSIGN_OR_RETURN(int pid, GetPid(kProcessName));
ECCLESIA_ASSIGN_OR_RETURN(int cpu_usage, GetCpuUsage(kProcessName));
int memory_usage = -1;
if (pid > 0) {
ECCLESIA_ASSIGN_OR_RETURN(memory_usage, GetMemoryUsage(pid));
}
absl::StatusOr<int64_t> last_active_timestamp =
GetLastActiveTimestamp(kProcessName);
if (!last_active_timestamp.ok()) {
LOG(ERROR) << "Failed to get last active timestamp: "
<< last_active_timestamp.status();
}
// Update monitored data.
{
absl::MutexLock lock(&mutex_);
monitored_data_.memory_usage_bytes.push_back(memory_usage);
monitored_data_.cpu_usage = cpu_usage;
monitored_data_.pid = pid;
// Check if the process was restarted outside of pacemaker.
if (last_active_timestamp.ok()) {
// We check if the `last_active_timestamp` we got from systemctl
// is different from the one we got from the previous run and if the
// process was not restarted by pacemaker.
// If the timestamps are different, it means the process was restarted
// outside of pacemaker and we should record it as an unknown error.
if (monitored_data_.last_active_timestamp != -1 &&
*last_active_timestamp != monitored_data_.last_active_timestamp &&
!monitored_data_.restart_triggered) {
LOG(INFO) << "Process " << kProcessName
<< " was restarted outside of pacemaker.";
unknown_restart_detected = true;
}
monitored_data_.last_active_timestamp = *last_active_timestamp;
}
}
if (unknown_restart_detected) {
RecordError(ErrorType::kUnknown);
}
if (!is_active) {
RecordError(ErrorType::kServiceInactive);
ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName));
return absl::OkStatus();
}
if (!is_listening) {
LOG(ERROR) << "Process " << kProcessName << " is not listening on port "
<< kExpectedListeningPort;
RecordError(ErrorType::kPortNotListening);
ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName));
return absl::OkStatus();
}
if (memory_usage > kMemoryUsageThreshold) {
LOG(ERROR) << "Process " << kProcessName << " memory usage is "
<< memory_usage << " bytes which is more than the threshold of "
<< kMemoryUsageThreshold;
RecordError(ErrorType::kMemoryUsageAboveThreshold);
ECCLESIA_RETURN_IF_ERROR(RestartService(kProcessName));
return absl::OkStatus();
}
// If we reach here, it means all checks passed.
// Reset the restart triggered and consecutive restart attempts counters.
absl::MutexLock lock(&mutex_);
monitored_data_.restart_triggered = false;
monitored_data_.consecutive_restart_attempts = 0;
return absl::OkStatus();
}
nlohmann::json Pacemaker::GetMonitoringData() const {
nlohmann::json json;
{
absl::MutexLock lock(&mutex_);
json = monitored_data_.ToJson();
}
json["PacemakerSchedulerStats"] = scheduler_.ToJson();
return json;
}
Pacemaker::Pacemaker(
absl::Duration interval,
std::unique_ptr<ShellCommandExecutor> shell_command_executor)
: shell_command_executor_(std::move(shell_command_executor)) {
scheduler_.RunAndScheduleAsync(
[this](absl::AnyInvocable<void()> OnDone) {
if (absl::Status status = PerformChecks(); !status.ok()) {
LOG(ERROR) << "Failed to perform checks: " << status;
} else {
// Dump the monitoring data to the syslog periodically.
LOG(WARNING) << "tlBMC Health stats:\n"
<< GetMonitoringData().dump(2);
}
OnDone();
},
interval);
}
} // namespace milotic_tlbmc