blob: 46ddb713f644ecddc23d10407e258b60484ad0d1 [file] [log] [blame] [edit]
/**
* @brief Integration tests for GPU telemetry
*
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#include "gpu-telemetry/client.h"
#include "server/server.hpp"
#include "server/mock_device.hpp"
#include "libnsm/base.h"
#include <gtest/gtest.h>
#include <thread>
#include <future>
#include <chrono>
using namespace testing;
using namespace gpu::telemetry;
using namespace std::chrono_literals;
class IntegrationTest : public Test {
protected:
void SetUp() override {
// Configure mock device
MockDeviceConfig deviceConfig;
deviceConfig.temperature.min = 30.0f;
deviceConfig.temperature.max = 80.0f;
deviceConfig.responseDelay = 10ms;
// Configure server
ServerConfig serverConfig;
serverConfig.socketPath = "/tmp/gpu-telemetry-test.sock";
serverConfig.maxClients = 5;
serverConfig.socketPerms = 0666;
// Create and start server
server_ = std::make_unique<Server>(
std::make_unique<MockDevice>(deviceConfig),
serverConfig);
serverFuture_ = std::async(std::launch::async, [this]() {
try {
stdexec::sync_wait(server_->start());
while (running_) {
std::this_thread::sleep_for(10ms);
}
stdexec::sync_wait(server_->stop());
}
catch (const std::exception& e) {
ADD_FAILURE() << "Server error: " << e.what();
}
});
// Wait for server to start
std::this_thread::sleep_for(100ms);
}
void TearDown() override {
running_ = false;
if (serverFuture_.valid()) {
serverFuture_.wait();
}
server_.reset();
unlink("/tmp/gpu-telemetry-test.sock");
}
// Helper to create temperature request
std::vector<uint8_t> createTempRequest() {
nsm_msg request = {};
encode_common_req(0x01, NSM_TYPE_TEMPERATURE,
NSM_GET_TEMPERATURE_READING, &request);
return std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&request),
reinterpret_cast<uint8_t*>(&request) + sizeof(request));
}
std::unique_ptr<Server> server_;
std::future<void> serverFuture_;
std::atomic<bool> running_{true};
std::vector<uint8_t> lastResponse_;
bool responseReceived_{false};
};
// Callback for client responses
void response_callback(void* user_data,
const uint8_t* response,
size_t response_len) {
auto test = static_cast<IntegrationTest*>(user_data);
test->lastResponse_.assign(response, response + response_len);
test->responseReceived_ = true;
}
TEST_F(IntegrationTest, SingleClient) {
// Initialize client
gpu_telemetry_ctx* ctx;
ASSERT_EQ(gpu_telemetry_init(&ctx), 0);
// Create and send temperature request
auto request = createTempRequest();
ASSERT_EQ(gpu_telemetry_send_message(ctx,
request.data(),
request.size(),
response_callback,
this), 0);
// Wait for response
for (int i = 0; i < 100 && !responseReceived_; i++) {
std::this_thread::sleep_for(10ms);
}
ASSERT_TRUE(responseReceived_);
// Verify response
ASSERT_GE(lastResponse_.size(), sizeof(nsm_msg));
const nsm_msg* response =
reinterpret_cast<const nsm_msg*>(lastResponse_.data());
ASSERT_EQ(response->header[0], NSM_SUCCESS);
uint16_t dataSize = *reinterpret_cast<const uint16_t*>(&response->header[1]);
ASSERT_EQ(dataSize, sizeof(float));
float temp = *reinterpret_cast<const float*>(response->payload);
ASSERT_GE(temp, 30.0f);
ASSERT_LE(temp, 80.0f);
gpu_telemetry_free(ctx);
}
TEST_F(IntegrationTest, MultipleClients) {
constexpr int NUM_CLIENTS = 3;
std::vector<gpu_telemetry_ctx*> clients(NUM_CLIENTS);
std::vector<bool> responses(NUM_CLIENTS, false);
// Initialize clients
for (int i = 0; i < NUM_CLIENTS; i++) {
ASSERT_EQ(gpu_telemetry_init(&clients[i]), 0);
}
// Send requests from all clients
auto request = createTempRequest();
for (int i = 0; i < NUM_CLIENTS; i++) {
ASSERT_EQ(gpu_telemetry_send_message(
clients[i],
request.data(),
request.size(),
[](void* user_data, const uint8_t*, size_t) {
auto received = static_cast<bool*>(user_data);
*received = true;
},
&responses[i]), 0);
}
// Wait for all responses
auto deadline = std::chrono::steady_clock::now() + 1s;
while (std::chrono::steady_clock::now() < deadline) {
if (std::all_of(responses.begin(), responses.end(),
[](bool r) { return r; })) {
break;
}
std::this_thread::sleep_for(10ms);
}
// Verify all responses received
for (int i = 0; i < NUM_CLIENTS; i++) {
EXPECT_TRUE(responses[i]) << "Client " << i << " didn't get response";
gpu_telemetry_free(clients[i]);
}
}
TEST_F(IntegrationTest, RapidRequests) {
gpu_telemetry_ctx* ctx;
ASSERT_EQ(gpu_telemetry_init(&ctx), 0);
// Send multiple rapid requests
constexpr int NUM_REQUESTS = 50;
std::vector<bool> responses(NUM_REQUESTS, false);
auto request = createTempRequest();
for (int i = 0; i < NUM_REQUESTS; i++) {
ASSERT_EQ(gpu_telemetry_send_message(
ctx,
request.data(),
request.size(),
[](void* user_data, const uint8_t*, size_t) {
auto received = static_cast<bool*>(user_data);
*received = true;
},
&responses[i]), 0);
}
// Wait for all responses
auto deadline = std::chrono::steady_clock::now() + 2s;
while (std::chrono::steady_clock::now() < deadline) {
if (std::all_of(responses.begin(), responses.end(),
[](bool r) { return r; })) {
break;
}
std::this_thread::sleep_for(10ms);
}
// Verify all responses received
EXPECT_TRUE(std::all_of(responses.begin(), responses.end(),
[](bool r) { return r; }));
gpu_telemetry_free(ctx);
}
TEST_F(IntegrationTest, ServerRestart) {
gpu_telemetry_ctx* ctx;
ASSERT_EQ(gpu_telemetry_init(&ctx), 0);
// Stop server
running_ = false;
serverFuture_.wait();
// Restart server
running_ = true;
serverFuture_ = std::async(std::launch::async, [this]() {
try {
stdexec::sync_wait(server_->start());
while (running_) {
std::this_thread::sleep_for(10ms);
}
stdexec::sync_wait(server_->stop());
}
catch (const std::exception& e) {
ADD_FAILURE() << "Server error: " << e.what();
}
});
std::this_thread::sleep_for(100ms);
// Send request after restart
auto request = createTempRequest();
ASSERT_EQ(gpu_telemetry_send_message(ctx,
request.data(),
request.size(),
response_callback,
this), 0);
// Wait for response
for (int i = 0; i < 100 && !responseReceived_; i++) {
std::this_thread::sleep_for(10ms);
}
ASSERT_TRUE(responseReceived_);
gpu_telemetry_free(ctx);
}