| // SPDX-License-Identifier: GPL-2.0 |
| #ifndef _GNU_SOURCE |
| #define _GNU_SOURCE |
| #endif |
| #include <fcntl.h> |
| #include <sys/types.h> |
| #include <dirent.h> |
| #include <grp.h> |
| #include <linux/limits.h> |
| #include <sched.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/eventfd.h> |
| #include <sys/fsuid.h> |
| #include <sys/prctl.h> |
| #include <sys/socket.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <sys/xattr.h> |
| #include <sys/mount.h> |
| |
| #include "../kselftest.h" |
| #include "wrappers.h" |
| #include "utils.h" |
| |
/* Nesting level at which create_userns_hierarchy() stops recursing. */
#define MAX_USERNS_LEVEL 32
| |
/*
 * syserror() - report the current errno on stderr and turn it into a
 * negative return value.
 *
 * Prints "<errno description> - <formatted message>\n" (the description comes
 * from glibc's "%m" conversion) and evaluates to -errno. errno is captured on
 * entry and restored afterwards so that a failing fprintf() can neither change
 * the value of the expression nor the errno seen by the caller.
 */
#define syserror(format, ...)                                        \
	({                                                           \
		int __saved_errno__ = errno;                         \
		fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \
		errno = __saved_errno__;                             \
		(-__saved_errno__);                                  \
	})
| |
/*
 * syserror_set() - report a caller-chosen error code on stderr.
 *
 * Sets errno to the absolute value of @__ret__, prints
 * "<errno description> - <formatted message>\n" and evaluates to @__ret__
 * unchanged, so it can be used directly in a return statement.
 *
 * @__ret__ is evaluated exactly once; errno is re-established after the
 * fprintf() so the caller always observes the requested error code.
 */
#define syserror_set(__ret__, format, ...)                                  \
	({                                                                  \
		__typeof__(__ret__) __internal_ret__ = (__ret__);           \
		int __internal_errno__ = (int)labs((long)__internal_ret__); \
		errno = __internal_errno__;                                 \
		fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__);        \
		errno = __internal_errno__;                                 \
		__internal_ret__;                                           \
	})
| |
/* Length of a string literal, not counting the terminating NUL byte. */
#define STRLITERALLEN(x) (sizeof(""x"") - 1)

/*
 * Size budget for the decimal text form of an integer type: the digit count
 * for 1/2/4/8 byte wide types (3/5/10/20) plus 2 extra characters. Types wider
 * than 8 bytes hit a negative array size and therefore fail to compile.
 */
#define INTTYPE_TO_STRLEN(type)                          \
	(2 + (sizeof(type) <= 1 ? 3 :                    \
	      sizeof(type) <= 2 ? 5 :                    \
	      sizeof(type) <= 4 ? 10 :                   \
	      sizeof(type) <= 8 ? 20 :                   \
	      sizeof(int[-2 * (sizeof(type) > 8)])))
| |
/*
 * list_for_each() - walk every node of a circular list except its head.
 *
 * @__iterator: cursor variable; receives each node in turn
 * @__list:     pointer to the list head
 *
 * Both arguments are fully parenthesized so that arbitrary expressions
 * (e.g. a ternary selecting between two heads) expand correctly.
 */
#define list_for_each(__iterator, __list)          \
	for ((__iterator) = (__list)->next;        \
	     (__iterator) != (__list);             \
	     (__iterator) = (__iterator)->next)
| |
/*
 * Kind of ID mapping an id_map entry describes. map_ids_from_idmap() iterates
 * from ID_TYPE_UID up to ID_TYPE_GID, so the numeric order matters.
 */
typedef enum idmap_type_t {
	ID_TYPE_UID = 0,	/* written to /proc/<pid>/uid_map */
	ID_TYPE_GID = 1,	/* written to /proc/<pid>/gid_map */
} idmap_type_t;
| |
| struct id_map { |
| idmap_type_t map_type; |
| __u32 nsid; |
| __u32 hostid; |
| __u32 range; |
| }; |
| |
/*
 * Node of a circular, doubly linked list. The head is a node as well;
 * list_init() makes it point at itself with a NULL payload.
 */
struct list {
	void *elem;		/* payload carried by this node */
	struct list *next;	/* following node (or the head) */
	struct list *prev;	/* preceding node (or the head) */
};
| |
| struct userns_hierarchy { |
| int fd_userns; |
| int fd_event; |
| unsigned int level; |
| struct list id_map; |
| }; |
| |
| static inline void list_init(struct list *list) |
| { |
| list->elem = NULL; |
| list->next = list->prev = list; |
| } |
| |
| static inline int list_empty(const struct list *list) |
| { |
| return list == list->next; |
| } |
| |
| static inline void __list_add(struct list *new, struct list *prev, struct list *next) |
| { |
| next->prev = new; |
| new->next = next; |
| new->prev = prev; |
| prev->next = new; |
| } |
| |
| static inline void list_add_tail(struct list *head, struct list *list) |
| { |
| __list_add(list, head->prev, head); |
| } |
| |
| static inline void list_del(struct list *list) |
| { |
| struct list *next, *prev; |
| |
| next = list->next; |
| prev = list->prev; |
| next->prev = prev; |
| prev->next = next; |
| } |
| |
/*
 * read() wrapper that transparently restarts the call when it is interrupted
 * by a signal (EINTR). Any other outcome is returned to the caller as-is.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
| |
/*
 * write() wrapper that transparently restarts the call when it is interrupted
 * by a signal (EINTR). Short writes and other errors are returned unchanged.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
| |
#define __STACK_SIZE (8 * 1024 * 1024)
/*
 * do_clone() - run @fn(@arg) in a child created with clone().
 *
 * @fn:    entry point of the child
 * @arg:   opaque argument handed to @fn
 * @flags: CLONE_* flags; SIGCHLD is always added as the exit signal
 *
 * A heap buffer of __STACK_SIZE bytes serves as the child's stack. The buffer
 * is released again when clone() fails, and also in the parent when the child
 * does not share our address space (no CLONE_VM): in that case the child runs
 * on its own copy. With CLONE_VM the child executes on this very buffer, so it
 * has to stay allocated.
 *
 * Return: the child's pid on success, -ENOMEM if the stack cannot be
 * allocated, or -1 with errno set when clone() itself fails.
 */
static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
{
	void *stack;
	pid_t pid;

	stack = malloc(__STACK_SIZE);
	if (!stack)
		return -ENOMEM;

#ifdef __ia64__
	pid = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL);
#else
	pid = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL);
#endif

	if (pid < 0 || !(flags & CLONE_VM)) {
		/* Callers inspect errno after a failure; keep it intact. */
		int saved_errno = errno;

		free(stack);
		errno = saved_errno;
	}

	return pid;
}
| |
/*
 * Child body for get_userns_fd_from_idmap(): do nothing but sleep. pause()
 * returns whenever a signal handler ran, hence the endless loop; the parent
 * ends the child with SIGKILL once it is done with it.
 */
static int get_userns_fd_cb(void *data)
{
	while (1)
		pause();

	/* Not reached. */
	_exit(0);
}
| |
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 *
 * Return: the child's exit status if it exited normally, -1 if waitpid()
 * failed or the child did not terminate via exit.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	do {
		reaped = waitpid(pid, &status, 0);
	} while (reaped == -1 && errno == EINTR);

	if (reaped == -1)
		return -1;

	return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
| |
| static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size) |
| { |
| int fd = -EBADF, setgroups_fd = -EBADF; |
| int fret = -1; |
| int ret; |
| char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + |
| STRLITERALLEN("/setgroups") + 1]; |
| |
| if (geteuid() != 0 && map_type == ID_TYPE_GID) { |
| ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid); |
| if (ret < 0 || ret >= sizeof(path)) |
| goto out; |
| |
| setgroups_fd = open(path, O_WRONLY | O_CLOEXEC); |
| if (setgroups_fd < 0 && errno != ENOENT) { |
| syserror("Failed to open \"%s\"", path); |
| goto out; |
| } |
| |
| if (setgroups_fd >= 0) { |
| ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n")); |
| if (ret != STRLITERALLEN("deny\n")) { |
| syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid); |
| goto out; |
| } |
| } |
| } |
| |
| ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, map_type == ID_TYPE_UID ? 'u' : 'g'); |
| if (ret < 0 || ret >= sizeof(path)) |
| goto out; |
| |
| fd = open(path, O_WRONLY | O_CLOEXEC); |
| if (fd < 0) { |
| syserror("Failed to open \"%s\"", path); |
| goto out; |
| } |
| |
| ret = write_nointr(fd, buf, buf_size); |
| if (ret != buf_size) { |
| syserror("Failed to write %cid mapping to \"%s\"", |
| map_type == ID_TYPE_UID ? 'u' : 'g', path); |
| goto out; |
| } |
| |
| fret = 0; |
| out: |
| close(fd); |
| close(setgroups_fd); |
| |
| return fret; |
| } |
| |
| static int map_ids_from_idmap(struct list *idmap, pid_t pid) |
| { |
| int fill, left; |
| char mapbuf[4096] = {}; |
| bool had_entry = false; |
| idmap_type_t map_type, u_or_g; |
| |
| if (list_empty(idmap)) |
| return 0; |
| |
| for (map_type = ID_TYPE_UID, u_or_g = 'u'; |
| map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') { |
| char *pos = mapbuf; |
| int ret; |
| struct list *iterator; |
| |
| |
| list_for_each(iterator, idmap) { |
| struct id_map *map = iterator->elem; |
| if (map->map_type != map_type) |
| continue; |
| |
| had_entry = true; |
| |
| left = 4096 - (pos - mapbuf); |
| fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range); |
| /* |
| * The kernel only takes <= 4k for writes to |
| * /proc/<pid>/{g,u}id_map |
| */ |
| if (fill <= 0 || fill >= left) |
| return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g); |
| |
| pos += fill; |
| } |
| if (!had_entry) |
| continue; |
| |
| ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf); |
| if (ret < 0) |
| return syserror("Failed to write mapping: %s", mapbuf); |
| |
| memset(mapbuf, 0, sizeof(mapbuf)); |
| } |
| |
| return 0; |
| } |
| |
| static int get_userns_fd_from_idmap(struct list *idmap) |
| { |
| int ret; |
| pid_t pid; |
| char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + |
| STRLITERALLEN("/ns/user") + 1]; |
| |
| pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS); |
| if (pid < 0) |
| return -errno; |
| |
| ret = map_ids_from_idmap(idmap, pid); |
| if (ret < 0) |
| return ret; |
| |
| ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid); |
| if (ret < 0 || (size_t)ret >= sizeof(path_ns)) |
| ret = -EIO; |
| else |
| ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY); |
| |
| (void)kill(pid, SIGKILL); |
| (void)wait_for_pid(pid); |
| return ret; |
| } |
| |
| int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range) |
| { |
| struct list head, uid_mapl, gid_mapl; |
| struct id_map uid_map = { |
| .map_type = ID_TYPE_UID, |
| .nsid = nsid, |
| .hostid = hostid, |
| .range = range, |
| }; |
| struct id_map gid_map = { |
| .map_type = ID_TYPE_GID, |
| .nsid = nsid, |
| .hostid = hostid, |
| .range = range, |
| }; |
| |
| list_init(&head); |
| uid_mapl.elem = &uid_map; |
| gid_mapl.elem = &gid_map; |
| list_add_tail(&head, &uid_mapl); |
| list_add_tail(&head, &gid_mapl); |
| |
| return get_userns_fd_from_idmap(&head); |
| } |
| |
/*
 * switch_ids() - drop supplementary groups and switch to @uid/@gid.
 *
 * Clears the supplementary group list, sets the real, effective and saved
 * gid to @gid and uid to @uid, and finally marks the process dumpable again.
 *
 * Return: true when every step succeeded, false as soon as one step fails
 * (the failure is logged to stderr and the remaining steps are skipped).
 */
bool switch_ids(uid_t uid, gid_t gid)
{
	if (setgroups(0, NULL)) {
		syserror("failure: setgroups");
		return false;
	}

	if (setresgid(gid, gid, gid)) {
		syserror("failure: setresgid");
		return false;
	}

	if (setresuid(uid, uid, uid)) {
		syserror("failure: setresuid");
		return false;
	}

	/* Ensure we can access proc files from processes we can ptrace. */
	if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0)) {
		syserror("failure: make dumpable");
		return false;
	}

	return true;
}
| |
| static int create_userns_hierarchy(struct userns_hierarchy *h); |
| |
| static int userns_fd_cb(void *data) |
| { |
| struct userns_hierarchy *h = data; |
| char c; |
| int ret; |
| |
| ret = read_nointr(h->fd_event, &c, 1); |
| if (ret < 0) |
| return syserror("failure: read from socketpair"); |
| |
| /* Only switch ids if someone actually wrote a mapping for us. */ |
| if (c == '1') { |
| if (!switch_ids(0, 0)) |
| return syserror("failure: switch ids to 0"); |
| } |
| |
| ret = write_nointr(h->fd_event, "1", 1); |
| if (ret < 0) |
| return syserror("failure: write to socketpair"); |
| |
| ret = create_userns_hierarchy(++h); |
| if (ret < 0) |
| return syserror("failure: userns level %d", h->level); |
| |
| return 0; |
| } |
| |
| static int create_userns_hierarchy(struct userns_hierarchy *h) |
| { |
| int fret = -1; |
| char c; |
| int fd_socket[2]; |
| int fd_userns = -EBADF, ret = -1; |
| ssize_t bytes; |
| pid_t pid; |
| char path[256]; |
| |
| if (h->level == MAX_USERNS_LEVEL) |
| return 0; |
| |
| ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket); |
| if (ret < 0) |
| return syserror("failure: create socketpair"); |
| |
| /* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */ |
| h->fd_event = fd_socket[1]; |
| pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM); |
| if (pid < 0) { |
| syserror("failure: userns level %d", h->level); |
| goto out_close; |
| } |
| |
| ret = map_ids_from_idmap(&h->id_map, pid); |
| if (ret < 0) { |
| kill(pid, SIGKILL); |
| syserror("failure: writing id mapping for userns level %d for %d", h->level, pid); |
| goto out_wait; |
| } |
| |
| if (!list_empty(&h->id_map)) |
| bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */ |
| else |
| bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */ |
| if (bytes < 0) { |
| kill(pid, SIGKILL); |
| syserror("failure: write to socketpair"); |
| goto out_wait; |
| } |
| |
| /* Wait for child to set*id() and become dumpable. */ |
| bytes = read_nointr(fd_socket[0], &c, 1); |
| if (bytes < 0) { |
| kill(pid, SIGKILL); |
| syserror("failure: read from socketpair"); |
| goto out_wait; |
| } |
| |
| snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); |
| fd_userns = open(path, O_RDONLY | O_CLOEXEC); |
| if (fd_userns < 0) { |
| kill(pid, SIGKILL); |
| syserror("failure: open userns level %d for %d", h->level, pid); |
| goto out_wait; |
| } |
| |
| fret = 0; |
| |
| out_wait: |
| if (!wait_for_pid(pid) && !fret) { |
| h->fd_userns = fd_userns; |
| fd_userns = -EBADF; |
| } |
| |
| out_close: |
| if (fd_userns >= 0) |
| close(fd_userns); |
| close(fd_socket[0]); |
| close(fd_socket[1]); |
| return fret; |
| } |
| |
/*
 * write_file() - write the string @val to the existing file @path.
 *
 * The whole string has to be accepted by a single write(). Every failure is
 * reported through ksft_print_msg(), and the descriptor is closed on all
 * paths.
 *
 * Return: 0 on success, -1 on any failure (open, write, short write, close).
 */
static int write_file(const char *path, const char *val)
{
	size_t len = strlen(val);
	ssize_t written;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd == -1) {
		ksft_print_msg("opening %s for write: %s\n", path, strerror(errno));
		return -1;
	}

	written = write(fd, val, len);
	if (written == -1) {
		ksft_print_msg("writing to %s: %s\n", path, strerror(errno));
		goto err_close;
	}
	if ((size_t)written != len) {
		ksft_print_msg("short write to %s\n", path);
		goto err_close;
	}

	if (close(fd) == -1) {
		ksft_print_msg("closing %s\n", path);
		return -1;
	}

	return 0;

err_close:
	/* Already failing; the result of close() no longer matters. */
	close(fd);
	return -1;
}
| |
/*
 * setup_userns() - move the calling process into new mount, user and pid
 * namespaces in which its current uid and gid are mapped to 0.
 *
 * After unsharing, "0 <uid> 1" is written to /proc/self/uid_map, "deny" to
 * /proc/self/setgroups and "0 <gid> 1" to /proc/self/gid_map; finally the
 * whole mount tree is made private.
 *
 * Return: 0 on success, non-zero on failure. A failing unshare() is reported
 * via ksft_exit_fail_msg().
 */
int setup_userns(void)
{
	int ret;
	char buf[32];
	/* Sampled before unshare(): these are the ids to be mapped. */
	uid_t uid = getuid();
	gid_t gid = getgid();

	ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
	if (ret) {
		ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
				   strerror(errno));
		return ret;
	}

	/* uid_t/gid_t are unsigned: format them as such, and bounded. */
	ret = snprintf(buf, sizeof(buf), "0 %u 1", (unsigned int)uid);
	if (ret < 0 || (size_t)ret >= sizeof(buf))
		return -1;
	ret = write_file("/proc/self/uid_map", buf);
	if (ret)
		return ret;

	ret = write_file("/proc/self/setgroups", "deny");
	if (ret)
		return ret;

	ret = snprintf(buf, sizeof(buf), "0 %u 1", (unsigned int)gid);
	if (ret < 0 || (size_t)ret >= sizeof(buf))
		return -1;
	ret = write_file("/proc/self/gid_map", buf);
	if (ret)
		return ret;

	ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
	if (ret) {
		ksft_print_msg("making mount tree private: %s\n", strerror(errno));
		return ret;
	}

	return 0;
}
| |
| /* caps_down - lower all effective caps */ |
| int caps_down(void) |
| { |
| bool fret = false; |
| cap_t caps = NULL; |
| int ret = -1; |
| |
| caps = cap_get_proc(); |
| if (!caps) |
| goto out; |
| |
| ret = cap_clear_flag(caps, CAP_EFFECTIVE); |
| if (ret) |
| goto out; |
| |
| ret = cap_set_proc(caps); |
| if (ret) |
| goto out; |
| |
| fret = true; |
| |
| out: |
| cap_free(caps); |
| return fret; |
| } |
| |
| /* cap_down - lower an effective cap */ |
| int cap_down(cap_value_t down) |
| { |
| bool fret = false; |
| cap_t caps = NULL; |
| cap_value_t cap = down; |
| int ret = -1; |
| |
| caps = cap_get_proc(); |
| if (!caps) |
| goto out; |
| |
| ret = cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap, 0); |
| if (ret) |
| goto out; |
| |
| ret = cap_set_proc(caps); |
| if (ret) |
| goto out; |
| |
| fret = true; |
| |
| out: |
| cap_free(caps); |
| return fret; |
| } |
| |
| uint64_t get_unique_mnt_id(const char *path) |
| { |
| struct statx sx; |
| int ret; |
| |
| ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx); |
| if (ret == -1) { |
| ksft_print_msg("retrieving unique mount ID for %s: %s\n", path, |
| strerror(errno)); |
| return 0; |
| } |
| |
| if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) { |
| ksft_print_msg("no unique mount ID available for %s\n", path); |
| return 0; |
| } |
| |
| return sx.stx_mnt_id; |
| } |