| // SPDX-License-Identifier: GPL-2.0 |
| #define _GNU_SOURCE |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <sched.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <linux/nsfs.h> |
| #include <sys/ioctl.h> |
| #include <sys/mman.h> |
| #include <sys/mount.h> |
| #include <sys/socket.h> |
| #include <sys/stat.h> |
| #include <sys/syscall.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| #include "../kselftest_harness.h" |
| #include "../filesystems/utils.h" |
| #include "../pidfd/pidfd.h" |
| #include "wrappers.h" |
| |
| /* |
| * Test listns() error handling with invalid buffer addresses. |
| * |
| * When the buffer pointer is invalid (e.g., crossing page boundaries |
| * into unmapped memory), listns() returns EINVAL. |
| * |
| * This test also creates mount namespaces that get destroyed during |
| * iteration, testing that namespace cleanup happens outside the RCU |
| * read lock. |
| */ |
| TEST(listns_partial_fault_with_ns_cleanup) |
| { |
| void *map; |
| __u64 *ns_ids; |
| ssize_t ret; |
| long page_size; |
| pid_t pid, iter_pid; |
| int pidfds[5]; |
| int sv[5][2]; |
| int iter_pidfd; |
| int i, status; |
| char c; |
| |
| page_size = sysconf(_SC_PAGESIZE); |
| ASSERT_GT(page_size, 0); |
| |
| /* |
| * Map two pages: |
| * - First page: readable and writable |
| * - Second page: will be unmapped to trigger EFAULT |
| */ |
| map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| ASSERT_NE(map, MAP_FAILED); |
| |
| /* Unmap the second page */ |
| ret = munmap((char *)map + page_size, page_size); |
| ASSERT_EQ(ret, 0); |
| |
| /* |
| * Position the buffer pointer so there's room for exactly one u64 |
| * before the page boundary. The second u64 would fall into the |
| * unmapped page. |
| */ |
| ns_ids = ((__u64 *)((char *)map + page_size)) - 1; |
| |
| /* |
| * Create a separate process to run listns() in a loop concurrently |
| * with namespace creation and destruction. |
| */ |
| iter_pid = create_child(&iter_pidfd, 0); |
| ASSERT_NE(iter_pid, -1); |
| |
| if (iter_pid == 0) { |
| struct ns_id_req req = { |
| .size = sizeof(req), |
| .spare = 0, |
| .ns_id = 0, |
| .ns_type = 0, /* All types */ |
| .spare2 = 0, |
| .user_ns_id = 0, /* Global listing */ |
| }; |
| int iter_ret; |
| |
| /* |
| * Loop calling listns() until killed. |
| * The kernel should: |
| * 1. Successfully write the first namespace ID (within valid page) |
| * 2. Fail with EFAULT when trying to write the second ID (unmapped page) |
| * 3. Handle concurrent namespace destruction without deadlock |
| */ |
| while (1) { |
| iter_ret = sys_listns(&req, ns_ids, 2, 0); |
| |
| if (iter_ret == -1 && errno == ENOSYS) |
| _exit(PIDFD_SKIP); |
| } |
| } |
| |
| /* Small delay to let iterator start looping */ |
| usleep(50000); |
| |
| /* |
| * Create several child processes, each in its own mount namespace. |
| * These will be destroyed while the iterator is running listns(). |
| */ |
| for (i = 0; i < 5; i++) { |
| /* Create socketpair for synchronization */ |
| ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); |
| |
| pid = create_child(&pidfds[i], CLONE_NEWNS); |
| ASSERT_NE(pid, -1); |
| |
| if (pid == 0) { |
| close(sv[i][0]); /* Close parent end */ |
| |
| if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) |
| _exit(1); |
| |
| /* Child: create a couple of tmpfs mounts */ |
| if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| |
| if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| |
| /* Signal parent that setup is complete */ |
| if (write_nointr(sv[i][1], "R", 1) != 1) |
| _exit(1); |
| |
| /* Wait for parent to signal us to exit */ |
| if (read_nointr(sv[i][1], &c, 1) != 1) |
| _exit(1); |
| |
| close(sv[i][1]); |
| _exit(0); |
| } |
| |
| close(sv[i][1]); /* Close child end */ |
| } |
| |
| /* Wait for all children to finish setup */ |
| for (i = 0; i < 5; i++) { |
| ret = read_nointr(sv[i][0], &c, 1); |
| ASSERT_EQ(ret, 1); |
| ASSERT_EQ(c, 'R'); |
| } |
| |
| /* |
| * Signal children to exit. This will destroy their mount namespaces |
| * while listns() is iterating the namespace tree. |
| * This tests that cleanup happens outside the RCU read lock. |
| */ |
| for (i = 0; i < 5; i++) |
| write_nointr(sv[i][0], "X", 1); |
| |
| /* Wait for all mount namespace children to exit and cleanup */ |
| for (i = 0; i < 5; i++) { |
| waitpid(-1, NULL, 0); |
| close(sv[i][0]); |
| close(pidfds[i]); |
| } |
| |
| /* Kill iterator and wait for it */ |
| sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); |
| ret = waitpid(iter_pid, &status, 0); |
| ASSERT_EQ(ret, iter_pid); |
| close(iter_pidfd); |
| |
| /* Should have been killed */ |
| ASSERT_TRUE(WIFSIGNALED(status)); |
| ASSERT_EQ(WTERMSIG(status), SIGKILL); |
| |
| /* Clean up */ |
| munmap(map, page_size); |
| } |
| |
| /* |
| * Test listns() error handling when the entire buffer is invalid. |
| * This is a sanity check that basic invalid pointer detection works. |
| */ |
| TEST(listns_complete_fault) |
| { |
| struct ns_id_req req = { |
| .size = sizeof(req), |
| .spare = 0, |
| .ns_id = 0, |
| .ns_type = 0, |
| .spare2 = 0, |
| .user_ns_id = 0, |
| }; |
| __u64 *ns_ids; |
| ssize_t ret; |
| |
| /* Use a clearly invalid pointer */ |
| ns_ids = (__u64 *)0xdeadbeef; |
| |
| ret = sys_listns(&req, ns_ids, 10, 0); |
| |
| if (ret == -1 && errno == ENOSYS) |
| SKIP(return, "listns() not supported"); |
| |
| /* Should fail with EFAULT */ |
| ASSERT_EQ(ret, -1); |
| ASSERT_EQ(errno, EFAULT); |
| } |
| |
| /* |
| * Test listns() error handling when the buffer is NULL. |
| */ |
| TEST(listns_null_buffer) |
| { |
| struct ns_id_req req = { |
| .size = sizeof(req), |
| .spare = 0, |
| .ns_id = 0, |
| .ns_type = 0, |
| .spare2 = 0, |
| .user_ns_id = 0, |
| }; |
| ssize_t ret; |
| |
| /* NULL buffer with non-zero count should fail */ |
| ret = sys_listns(&req, NULL, 10, 0); |
| |
| if (ret == -1 && errno == ENOSYS) |
| SKIP(return, "listns() not supported"); |
| |
| /* Should fail with EFAULT */ |
| ASSERT_EQ(ret, -1); |
| ASSERT_EQ(errno, EFAULT); |
| } |
| |
| /* |
| * Test listns() with a buffer that becomes invalid mid-iteration |
| * (after several successful writes), combined with mount namespace |
| * destruction to test RCU cleanup logic. |
| */ |
| TEST(listns_late_fault_with_ns_cleanup) |
| { |
| void *map; |
| __u64 *ns_ids; |
| ssize_t ret; |
| long page_size; |
| pid_t pid, iter_pid; |
| int pidfds[10]; |
| int sv[10][2]; |
| int iter_pidfd; |
| int i, status; |
| char c; |
| |
| page_size = sysconf(_SC_PAGESIZE); |
| ASSERT_GT(page_size, 0); |
| |
| /* Map two pages */ |
| map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| ASSERT_NE(map, MAP_FAILED); |
| |
| /* Unmap the second page */ |
| ret = munmap((char *)map + page_size, page_size); |
| ASSERT_EQ(ret, 0); |
| |
| /* |
| * Position buffer so we can write several u64s successfully |
| * before hitting the page boundary. |
| */ |
| ns_ids = ((__u64 *)((char *)map + page_size)) - 5; |
| |
| /* |
| * Create a separate process to run listns() concurrently. |
| */ |
| iter_pid = create_child(&iter_pidfd, 0); |
| ASSERT_NE(iter_pid, -1); |
| |
| if (iter_pid == 0) { |
| struct ns_id_req req = { |
| .size = sizeof(req), |
| .spare = 0, |
| .ns_id = 0, |
| .ns_type = 0, |
| .spare2 = 0, |
| .user_ns_id = 0, |
| }; |
| int iter_ret; |
| |
| /* |
| * Loop calling listns() until killed. |
| * Request 10 namespace IDs while namespaces are being destroyed. |
| * This tests: |
| * 1. EFAULT handling when buffer becomes invalid |
| * 2. Namespace cleanup outside RCU read lock during iteration |
| */ |
| while (1) { |
| iter_ret = sys_listns(&req, ns_ids, 10, 0); |
| |
| if (iter_ret == -1 && errno == ENOSYS) |
| _exit(PIDFD_SKIP); |
| } |
| } |
| |
| /* Small delay to let iterator start looping */ |
| usleep(50000); |
| |
| /* |
| * Create more children with mount namespaces to increase the |
| * likelihood that namespace cleanup happens during iteration. |
| */ |
| for (i = 0; i < 10; i++) { |
| /* Create socketpair for synchronization */ |
| ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); |
| |
| pid = create_child(&pidfds[i], CLONE_NEWNS); |
| ASSERT_NE(pid, -1); |
| |
| if (pid == 0) { |
| close(sv[i][0]); /* Close parent end */ |
| |
| if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) |
| _exit(1); |
| |
| /* Child: create tmpfs mounts */ |
| if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| |
| if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| |
| /* Signal parent that setup is complete */ |
| if (write_nointr(sv[i][1], "R", 1) != 1) |
| _exit(1); |
| |
| /* Wait for parent to signal us to exit */ |
| if (read_nointr(sv[i][1], &c, 1) != 1) |
| _exit(1); |
| |
| close(sv[i][1]); |
| _exit(0); |
| } |
| |
| close(sv[i][1]); /* Close child end */ |
| } |
| |
| /* Wait for all children to finish setup */ |
| for (i = 0; i < 10; i++) { |
| ret = read_nointr(sv[i][0], &c, 1); |
| ASSERT_EQ(ret, 1); |
| ASSERT_EQ(c, 'R'); |
| } |
| |
| /* Kill half the children */ |
| for (i = 0; i < 5; i++) |
| write_nointr(sv[i][0], "X", 1); |
| |
| /* Small delay to let some exit */ |
| usleep(10000); |
| |
| /* Kill remaining children */ |
| for (i = 5; i < 10; i++) |
| write_nointr(sv[i][0], "X", 1); |
| |
| /* Wait for all children and cleanup */ |
| for (i = 0; i < 10; i++) { |
| waitpid(-1, NULL, 0); |
| close(sv[i][0]); |
| close(pidfds[i]); |
| } |
| |
| /* Kill iterator and wait for it */ |
| sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); |
| ret = waitpid(iter_pid, &status, 0); |
| ASSERT_EQ(ret, iter_pid); |
| close(iter_pidfd); |
| |
| /* Should have been killed */ |
| ASSERT_TRUE(WIFSIGNALED(status)); |
| ASSERT_EQ(WTERMSIG(status), SIGKILL); |
| |
| /* Clean up */ |
| munmap(map, page_size); |
| } |
| |
| /* |
| * Test specifically focused on mount namespace cleanup during EFAULT. |
| * Filter for mount namespaces only. |
| */ |
| TEST(listns_mnt_ns_cleanup_on_fault) |
| { |
| void *map; |
| __u64 *ns_ids; |
| ssize_t ret; |
| long page_size; |
| pid_t pid, iter_pid; |
| int pidfds[8]; |
| int sv[8][2]; |
| int iter_pidfd; |
| int i, status; |
| char c; |
| |
| page_size = sysconf(_SC_PAGESIZE); |
| ASSERT_GT(page_size, 0); |
| |
| /* Set up partial fault buffer */ |
| map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| ASSERT_NE(map, MAP_FAILED); |
| |
| ret = munmap((char *)map + page_size, page_size); |
| ASSERT_EQ(ret, 0); |
| |
| /* Position for 3 successful writes, then fault */ |
| ns_ids = ((__u64 *)((char *)map + page_size)) - 3; |
| |
| /* |
| * Create a separate process to run listns() concurrently. |
| */ |
| iter_pid = create_child(&iter_pidfd, 0); |
| ASSERT_NE(iter_pid, -1); |
| |
| if (iter_pid == 0) { |
| struct ns_id_req req = { |
| .size = sizeof(req), |
| .spare = 0, |
| .ns_id = 0, |
| .ns_type = CLONE_NEWNS, /* Only mount namespaces */ |
| .spare2 = 0, |
| .user_ns_id = 0, |
| }; |
| int iter_ret; |
| |
| /* |
| * Loop calling listns() until killed. |
| * Call listns() to race with namespace destruction. |
| */ |
| while (1) { |
| iter_ret = sys_listns(&req, ns_ids, 10, 0); |
| |
| if (iter_ret == -1 && errno == ENOSYS) |
| _exit(PIDFD_SKIP); |
| } |
| } |
| |
| /* Small delay to let iterator start looping */ |
| usleep(50000); |
| |
| /* Create children with mount namespaces */ |
| for (i = 0; i < 8; i++) { |
| /* Create socketpair for synchronization */ |
| ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); |
| |
| pid = create_child(&pidfds[i], CLONE_NEWNS); |
| ASSERT_NE(pid, -1); |
| |
| if (pid == 0) { |
| close(sv[i][0]); /* Close parent end */ |
| |
| if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) |
| _exit(1); |
| |
| /* Do some mount operations to make cleanup more interesting */ |
| if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) |
| _exit(1); |
| |
| if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) |
| _exit(1); |
| |
| /* Signal parent that setup is complete */ |
| if (write_nointr(sv[i][1], "R", 1) != 1) |
| _exit(1); |
| |
| /* Wait for parent to signal us to exit */ |
| if (read_nointr(sv[i][1], &c, 1) != 1) |
| _exit(1); |
| |
| close(sv[i][1]); |
| _exit(0); |
| } |
| |
| close(sv[i][1]); /* Close child end */ |
| } |
| |
| /* Wait for all children to finish setup */ |
| for (i = 0; i < 8; i++) { |
| ret = read_nointr(sv[i][0], &c, 1); |
| ASSERT_EQ(ret, 1); |
| ASSERT_EQ(c, 'R'); |
| } |
| |
| /* Kill children to trigger namespace destruction during iteration */ |
| for (i = 0; i < 8; i++) |
| write_nointr(sv[i][0], "X", 1); |
| |
| /* Wait for children and cleanup */ |
| for (i = 0; i < 8; i++) { |
| waitpid(-1, NULL, 0); |
| close(sv[i][0]); |
| close(pidfds[i]); |
| } |
| |
| /* Kill iterator and wait for it */ |
| sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); |
| ret = waitpid(iter_pid, &status, 0); |
| ASSERT_EQ(ret, iter_pid); |
| close(iter_pidfd); |
| |
| /* Should have been killed */ |
| ASSERT_TRUE(WIFSIGNALED(status)); |
| ASSERT_EQ(WTERMSIG(status), SIGKILL); |
| |
| munmap(map, page_size); |
| } |
| |
| TEST_HARNESS_MAIN |