blob: 75f1da05255f5e7e4be1f7d29d74ee9748dad2a3 [file] [log] [blame]
#include "bmc/remote_state_ping.h"
#include <iterator>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "bmc/daemon_context_bmc.h"
#include "daemon_context.h"
#include "safepower_agent.pb.h"
#include "safepower_agent_config.pb.h"
#include "state_updater.h"
#include "one/network_interfaces.pb.h"
#include "one/offline_node_entities.pb.h"
#include "one/resolved_entities.pb.h"
#include "absl/functional/bind_front.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/time.h"
// NOLINTBEGIN(readability/boost) // Runs on BMC.
#include "boost/process.hpp"
#include "boost/process/args.hpp"
#include "boost/system.hpp"
// NOLINTEND(readability/boost)
namespace safepower_agent {
// Creates a monitor that pings `hostname` and publishes the result as the boot
// state of `entity_tag` through `system_state_updater`.
//
// `ping_config` is taken by value so that its command and extra arguments can
// be moved out of it. The resulting argument list is the configured extra
// arguments followed by "-c", the configured count per check, and the
// hostname. The assembled command line is logged once at INFO.
PingStateMonitor::PingStateMonitor(
    std::string entity_tag, std::string hostname,
    safepower_agent_config::PingConfig ping_config,
    std::shared_ptr<StateUpdater<safepower_agent_proto::SystemState>>
        system_state_updater)
    : entity_tag_(std::move(entity_tag)),
      hostname_(std::move(hostname)),
      command_({std::move(*ping_config.mutable_command())}),
      interval_(absl::Milliseconds(ping_config.interval_ms())),
      system_state_updater_(std::move(system_state_updater)),
      args_(std::make_move_iterator(ping_config.mutable_extra_args()->begin()),
            std::make_move_iterator(ping_config.mutable_extra_args()->end())) {
  // Fixed trailing arguments: packet count, then the target host.
  args_.emplace_back("-c");
  args_.emplace_back(absl::StrCat(ping_config.count_per_check()));
  args_.emplace_back(hostname_);

  const std::string joined_args = absl::StrJoin(args_, " ");
  LOG(INFO) << "Ping command: " << command_ << " " << joined_args;
}
// Move-constructs a monitor from `other`, taking over its configuration
// (entity tag, hostname, command, interval, arguments), its state updater and
// its last recorded reachable state.
//
// The timer task name and the callback handles are not transferred, so the
// source must not be started; this is enforced with a CHECK that runs after
// the members above have been moved.
PingStateMonitor::PingStateMonitor(PingStateMonitor&& other)
    : entity_tag_(std::move(other.entity_tag_)),
      hostname_(std::move(other.hostname_)),
      command_(std::move(other.command_)),
      // The interval is a plain value; copying it is equivalent to moving it.
      interval_(other.interval_),
      system_state_updater_(std::move(other.system_state_updater_)),
      args_(std::move(other.args_)),
      reachable_state_(other.reachable_state_) {
  CHECK(!other.Started()) << "PingStateMonitor moved while started";
}
// Move-assigns the configuration (entity tag, hostname, command, interval,
// arguments), the state updater and the last recorded reachable state from
// `other`.
//
// Neither object may be started. If either one is, a DFATAL message is logged
// and `*this` is returned unchanged. The timer task name and the callback
// handles are never transferred.
//
// Self-assignment is a no-op.
PingStateMonitor& PingStateMonitor::operator=(PingStateMonitor&& other) {
  if (Started()) {
    LOG(DFATAL) << "PingStateMonitor replaced while started";
    return *this;
  }
  if (other.Started()) {
    LOG(DFATAL) << "PingStateMonitor moved while started";
    return *this;
  }
  // Moving each member onto itself would leave the standard-library strings
  // and containers in a valid but unspecified (possibly emptied) state, so
  // bail out before touching anything.
  if (this == &other) {
    return *this;
  }
  entity_tag_ = std::move(other.entity_tag_);
  hostname_ = std::move(other.hostname_);
  command_ = std::move(other.command_);
  interval_ = std::move(other.interval_);
  system_state_updater_ = std::move(other.system_state_updater_);
  args_ = std::move(other.args_);
  reachable_state_ = other.reachable_state_;
  return *this;
}
// Reports whether the monitor is in use: either a timer task is registered or
// one of the activate/idle callbacks is still pending. Takes `task_mutex_` for
// the duration of the check.
bool PingStateMonitor::Started() const {
  absl::MutexLock lock(task_mutex_);
  if (!task_name_.empty()) {
    return true;
  }
  if (activate_callback_handle_.pending()) {
    return true;
  }
  return idle_callback_handle_.pending();
}
void PingStateMonitor::StopTimer() {
if (task_name_.empty()) {
return;
}
LOG(INFO) << "Stopping ping state monitor for " << entity_tag_;
absl::Status status = DaemonContext::Get().scheduler().CancelCall(task_name_);
if (!status.ok()) {
LOG(WARNING) << "Failed to cancel ping state monitor: " << status;
}
task_name_.clear();
}
// Registers Ping() with the daemon scheduler as a periodic call that runs
// every `interval`, under the task name "ping_state_monitor_<entity_tag>".
//
// The task name is remembered in `task_name_` only if registration succeeds;
// on failure a DFATAL message is logged and `task_name_` is left untouched.
void PingStateMonitor::StartTimer(absl::Duration interval) {
  LOG(INFO) << "Starting ping state monitor for " << entity_tag_
            << " with interval " << interval;

  std::string name = absl::StrCat("ping_state_monitor_", entity_tag_);
  auto&& scheduler = DaemonContext::Get().scheduler();
  absl::Status schedule_status = scheduler.PeriodicCall(
      absl::bind_front(&PingStateMonitor::Ping, this), interval, name);

  if (schedule_status.ok()) {
    task_name_ = std::move(name);
    return;
  }
  LOG(DFATAL) << "Failed to start ping state monitor: " << schedule_status;
}
// Stops pinging and re-arms the monitor so that Resume() runs via the state
// updater's OnActive hook. Registered from Resume() as the OnIdle callback.
//
// Being called without a running timer is reported as DFATAL, but the rest of
// the sequence is still executed.
void PingStateMonitor::Pause() {
  {
    absl::MutexLock lock(task_mutex_);
    const bool timer_registered = !task_name_.empty();
    if (!timer_registered) {
      LOG(DFATAL) << "PingStateMonitor paused while task not started";
    }
    StopTimer();
    // While paused nobody is probing the node, so it may go down or come back
    // up unnoticed. Publishing "unknown" guarantees that a listener attached
    // later never observes a stale state. State comparisons are usually
    // equality checks, so this should not break any assumptions.
    UpdateReachableState(ReachableState::kUnknown);
  }
  // Registered after the lock scope above has ended.
  auto resume_callback = absl::bind_front(&PingStateMonitor::Resume, this);
  activate_callback_handle_ =
      system_state_updater_->OnActive(std::move(resume_callback));
}
// Starts the periodic ping timer with the configured interval and registers
// Pause() with the state updater's OnIdle hook. Registered from Start() and
// Pause() as the OnActive callback.
//
// If a timer is already registered, a DFATAL message is logged and nothing
// else happens (no OnIdle registration either).
void PingStateMonitor::Resume() {
  {
    absl::MutexLock lock(task_mutex_);
    const bool timer_registered = !task_name_.empty();
    if (timer_registered) {
      LOG(DFATAL) << "PingStateMonitor resumed while task started";
      return;
    }
    StartTimer(interval_);
  }
  // Registered after the lock scope above has ended.
  auto pause_callback = absl::bind_front(&PingStateMonitor::Pause, this);
  idle_callback_handle_ =
      system_state_updater_->OnIdle(std::move(pause_callback));
}
// Arms the monitor by registering Resume() with the state updater's OnActive
// hook. Pinging itself is started from Resume(), not from here.
//
// NOTE(review): presumably OnActive invokes the callback right away when the
// updater is already active, in which case monitoring begins immediately --
// confirm against StateUpdater.
//
// Always returns OkStatus.
absl::Status PingStateMonitor::Start() {
  auto resume_callback = absl::bind_front(&PingStateMonitor::Resume, this);
  activate_callback_handle_ =
      system_state_updater_->OnActive(std::move(resume_callback));
  return absl::OkStatus();
}
// Tears the monitor down: cancels both state-updater callbacks, then stops the
// periodic ping task while holding `task_mutex_`.
PingStateMonitor::~PingStateMonitor() {
// Cancel the callbacks first so that neither Pause() nor Resume() is
// registered to run any more.
idle_callback_handle_.TryCancel();
activate_callback_handle_.TryCancel();
// The mutex is expected to be free at destruction time. If it is not, report
// it as DFATAL and then wait for it so that StopTimer() still runs under the
// lock.
if (!task_mutex_.try_lock()) {
LOG(DFATAL) << "PingStateMonitor destroyed while mutex was held";
task_mutex_.lock();
}
StopTimer();
task_mutex_.unlock();
}
// Launches one asynchronous run of the configured ping command with the
// prepared arguments on the BMC daemon's io_context. The child's stdout and
// stderr are discarded; completion is delivered to ProcessPingResult().
//
// TODO(jebr): b/450140775 - occasionally log the ping command output
void PingStateMonitor::Ping() {
  auto& io_context = DaemonContextBMC::Get().get_io_context();
  auto on_exit = absl::bind_front(&PingStateMonitor::ProcessPingResult, this);
  boost::process::async_system(
      io_context, std::move(on_exit),
      boost::process::exe = command_, boost::process::args = args_,
      (boost::process::std_out & boost::process::std_err) >
          boost::process::null);
}
// Completion handler for the ping launched by Ping().
//
// `ec` is the error reported for running the command; when set, the failure is
// logged and the reachable state is left as it was. Otherwise an `exit_code`
// of zero is recorded as reachable and any other value as unreachable.
void PingStateMonitor::ProcessPingResult(boost::system::error_code ec,
                                         int exit_code) {
  if (ec) {
    LOG(ERROR) << "Failed to ping " << hostname_ << ": " << ec.message();
    return;
  }
  if (exit_code == 0) {
    UpdateReachableState(ReachableState::kReachable);
  } else {
    UpdateReachableState(ReachableState::kUnreachable);
  }
}
// Records a new reachable state and publishes the matching boot state for this
// entity through the system state updater.
//
// Nothing happens when `new_reachable_state` equals the currently recorded
// state. Otherwise the transition is logged at INFO and mapped as follows:
//   kUnknown     -> BOOT_STATE_UNSPECIFIED
//   kReachable   -> BOOT_STATE_BOOTING
//   kUnreachable -> BOOT_STATE_NOT_BOOTED
// An unrecognized enum value is reported as DFATAL and ignored: no update is
// published and the recorded state is left unchanged.
void PingStateMonitor::UpdateReachableState(
    ReachableState new_reachable_state) {
  if (new_reachable_state == reachable_state_) {
    return;
  }
  safepower_agent_proto::BootStateSpecifier state =
      safepower_agent_proto::BOOT_STATE_UNSPECIFIED;
  switch (new_reachable_state) {
    case ReachableState::kUnknown:
      LOG(INFO) << entity_tag_ << " reachability state is unknown";
      state = safepower_agent_proto::BOOT_STATE_UNSPECIFIED;
      break;
    case ReachableState::kReachable:
      LOG(INFO) << entity_tag_ << " is reachable";
      state = safepower_agent_proto::BOOT_STATE_BOOTING;
      break;
    case ReachableState::kUnreachable:
      LOG(INFO) << entity_tag_ << " is unreachable";
      state = safepower_agent_proto::BOOT_STATE_NOT_BOOTED;
      break;
    default:
      LOG(DFATAL) << "Unknown reachable state: "
                  << static_cast<int>(new_reachable_state);
      return;
  }
  // Publish an update that carries only this entity's boot state.
  safepower_agent_proto::SystemState new_state;
  (*new_state.mutable_node_state())[entity_tag_]
      .mutable_boot_state()
      ->set_state(state);
  system_state_updater_->UpdateState(new_state);
  // Remember the state only after it has been handed to the updater.
  reachable_state_ = new_reachable_state;
}
} // namespace safepower_agent