|  | // SPDX-License-Identifier: GPL-2.0-or-later | 
|  | /* | 
|  | * Debugging module statistics. | 
|  | * | 
|  | * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> | 
|  | */ | 
|  |  | 
|  | #include <linux/module.h> | 
|  | #include <uapi/linux/module.h> | 
|  | #include <linux/string.h> | 
|  | #include <linux/printk.h> | 
|  | #include <linux/slab.h> | 
|  | #include <linux/list.h> | 
|  | #include <linux/debugfs.h> | 
|  | #include <linux/rculist.h> | 
|  | #include <linux/math.h> | 
|  |  | 
|  | #include "internal.h" | 
|  |  | 
|  | /** | 
|  | * DOC: module debugging statistics overview | 
|  | * | 
|  | * Enabling CONFIG_MODULE_STATS enables module debugging statistics which | 
|  | * are useful to monitor and root cause memory pressure issues with module | 
|  | * loading. These statistics are useful to allow us to improve production | 
|  | * workloads. | 
|  | * | 
|  | * The current module debugging statistics supported help keep track of module | 
|  | * loading failures to enable improvements either for kernel module auto-loading | 
|  | * usage (request_module()) or interactions with userspace. Statistics are | 
|  | * provided to track all possible failures in the finit_module() path and memory | 
|  | * wasted in this process space.  Each of the failure counters are associated | 
|  | * to a type of module loading failure which is known to incur a certain amount | 
|  | * of memory allocation loss. In the worst case loading a module will fail after | 
|  | * a 3 step memory allocation process: | 
|  | * | 
|  | *   a) memory allocated with kernel_read_file_from_fd() | 
|  | *   b) module decompression processes the file read from | 
|  | *      kernel_read_file_from_fd(), and vmap() is used to map | 
|  | *      the decompressed module to a new local buffer which represents | 
|  | *      a copy of the decompressed module passed from userspace. The buffer | 
|  | *      from kernel_read_file_from_fd() is freed right away. | 
|  | *   c) layout_and_allocate() allocates space for the final resting | 
|  | *      place where we would keep the module if it were to be processed | 
|  | *      successfully. | 
|  | * | 
|  | * If a failure occurs after these three different allocations only one | 
|  | * counter will be incremented with the summation of the allocated bytes freed | 
|  | * incurred during this failure. Likewise, if module loading failed only after | 
|  | * step b) a separate counter is used and incremented for the bytes freed and | 
|  | * not used during both of those allocations. | 
|  | * | 
|  | * Virtual memory space can be limited, for example on x86 virtual memory size | 
|  | * defaults to 128 MiB. We should strive to limit and avoid wasting virtual | 
|  | * memory allocations when possible. These module debugging statistics help | 
|  | * to evaluate how much memory is being wasted on bootup due to module loading | 
|  | * failures. | 
|  | * | 
|  | * All counters are designed to be incremental. Atomic counters are used so to | 
|  | * remain simple and avoid delays and deadlocks. | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * DOC: dup_failed_modules - tracks duplicate failed modules | 
|  | * | 
|  | * Linked list of modules which failed to be loaded because an already existing | 
|  | * module with the same name was already being processed or already loaded. | 
|  | * The finit_module() system call incurs heavy virtual memory allocations. In | 
|  | * the worst case an finit_module() system call can end up allocating virtual | 
|  | * memory 3 times: | 
|  | * | 
|  | *   1) kernel_read_file_from_fd() call uses vmalloc() | 
|  | *   2) optional module decompression uses vmap() | 
|  | *   3) layout_and allocate() can use vzalloc() or an arch specific variation of | 
|  | *      vmalloc to deal with ELF sections requiring special permissions | 
|  | * | 
|  | * In practice on a typical boot today most finit_module() calls fail due to | 
|  | * the module with the same name already being loaded or about to be processed. | 
|  | * All virtual memory allocated to these failed modules will be freed with | 
|  | * no functional use. | 
|  | * | 
|  | * To help with this the dup_failed_modules allows us to track modules which | 
|  | * failed to load due to the fact that a module was already loaded or being | 
|  | * processed.  There are only two points at which we can fail such calls, | 
|  | * we list them below along with the number of virtual memory allocation | 
|  | * calls: | 
|  | * | 
|  | *   a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before | 
|  | *	layout_and_allocate(). | 
|  | *	- with module decompression: 2 virtual memory allocation calls | 
|  | *	- without module decompression: 1 virtual memory allocation calls | 
|  | *   b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() | 
|  | *   	- with module decompression 3 virtual memory allocation calls | 
|  | *   	- without module decompression 2 virtual memory allocation calls | 
|  | * | 
|  | * We should strive to get this list to be as small as possible. If this list | 
|  | * is not empty it is a reflection of possible work or optimizations possible | 
|  | * either in-kernel or in userspace. | 
|  | */ | 
|  | static LIST_HEAD(dup_failed_modules); | 
|  |  | 
|  | /** | 
|  | * DOC: module statistics debugfs counters | 
|  | * | 
|  | * The total amount of wasted virtual memory allocation space during module | 
|  | * loading can be computed by adding the total from the summation: | 
|  | * | 
|  | *   * @invalid_kread_bytes + | 
|  | *     @invalid_decompress_bytes + | 
|  | *     @invalid_becoming_bytes + | 
|  | *     @invalid_mod_bytes | 
|  | * | 
|  | * The following debugfs counters are available to inspect module loading | 
|  | * failures: | 
|  | * | 
|  | *   * total_mod_size: total bytes ever used by all modules we've dealt with on | 
|  | *     this system | 
|  | *   * total_text_size: total bytes of the .text and .init.text ELF section | 
|  | *     sizes we've dealt with on this system | 
|  | *   * invalid_kread_bytes: bytes allocated and then freed on failures which | 
|  | *     happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() | 
|  | *     uses vmalloc(). These should typically not happen unless your system is | 
|  | *     under memory pressure. | 
|  | *   * invalid_decompress_bytes: number of bytes allocated and freed due to | 
|  | *     memory allocations in the module decompression path that use vmap(). | 
|  | *     These typically should not happen unless your system is under memory | 
|  | *     pressure. | 
|  | *   * invalid_becoming_bytes: total number of bytes allocated and freed used | 
|  | *     to read the kernel module userspace wants us to read before we | 
|  | *     promote it to be processed to be added to our @modules linked list. These | 
|  | *     failures can happen if we had a check in between a successful kernel_read_file_from_fd() | 
|  | *     call and right before we allocate the our private memory for the module | 
|  | *     which would be kept if the module is successfully loaded. The most common | 
|  | *     reason for this failure is when userspace is racing to load a module | 
|  | *     which it does not yet see loaded. The first module to succeed in | 
|  | *     add_unformed_module() will add a module to our &modules list and | 
|  | *     subsequent loads of modules with the same name will error out at the | 
|  | *     end of early_mod_check(). The check for module_patient_check_exists() | 
|  | *     at the end of early_mod_check() prevents duplicate allocations | 
|  | *     on layout_and_allocate() for modules already being processed. These | 
|  | *     duplicate failed modules are non-fatal, however they typically are | 
|  | *     indicative of userspace not seeing a module in userspace loaded yet and | 
|  | *     unnecessarily trying to load a module before the kernel even has a chance | 
|  | *     to begin to process prior requests. Although duplicate failures can be | 
|  | *     non-fatal, we should try to reduce vmalloc() pressure proactively, so | 
|  | *     ideally after boot this will be close to as 0 as possible.  If module | 
|  | *     decompression was used we also add to this counter the cost of the | 
|  | *     initial kernel_read_file_from_fd() of the compressed module. If module | 
|  | *     decompression was not used the value represents the total allocated and | 
|  | *     freed bytes in kernel_read_file_from_fd() calls for these type of | 
|  | *     failures. These failures can occur because: | 
|  | * | 
|  | *    * module_sig_check() - module signature checks | 
|  | *    * elf_validity_cache_copy() - some ELF validation issue | 
|  | *    * early_mod_check(): | 
|  | * | 
|  | *      * blacklisting | 
|  | *      * failed to rewrite section headers | 
|  | *      * version magic | 
|  | *      * live patch requirements didn't check out | 
|  | *      * the module was detected as being already present | 
|  | * | 
|  | *   * invalid_mod_bytes: these are the total number of bytes allocated and | 
|  | *     freed due to failures after we did all the sanity checks of the module | 
|  | *     which userspace passed to us and after our first check that the module | 
|  | *     is unique.  A module can still fail to load if we detect the module is | 
|  | *     loaded after we allocate space for it with layout_and_allocate(), we do | 
|  | *     this check right before processing the module as live and run its | 
|  | *     initialization routines. Note that you have a failure of this type it | 
|  | *     also means the respective kernel_read_file_from_fd() memory space was | 
|  | *     also freed and not used, and so we increment this counter with twice | 
|  | *     the size of the module. Additionally if you used module decompression | 
|  | *     the size of the compressed module is also added to this counter. | 
|  | * | 
|  | *  * modcount: how many modules we've loaded in our kernel life time | 
|  | *  * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd() | 
|  | *  * failed_decompress: how many failed module decompression attempts we've had. | 
|  | *    These really should not happen unless your compression / decompression | 
|  | *    might be broken. | 
|  | *  * failed_becoming: how many modules failed after we kernel_read_file_from_fd() | 
|  | *    it and before we allocate memory for it with layout_and_allocate(). This | 
|  | *    counter is never incremented if you manage to validate the module and | 
|  | *    call layout_and_allocate() for it. | 
|  | *  * failed_load_modules: how many modules failed once we've allocated our | 
|  | *    private space for our module using layout_and_allocate(). These failures | 
|  | *    should hopefully mostly be dealt with already. Races in theory could | 
|  | *    still exist here, but it would just mean the kernel had started processing | 
|  | *    two threads concurrently up to early_mod_check() and one thread won. | 
|  | *    These failures are good signs the kernel or userspace is doing something | 
|  | *    seriously stupid or that could be improved. We should strive to fix these, | 
|  | *    but it is perhaps not easy to fix them. A recent example are the modules | 
|  | *    requests incurred for frequency modules, a separate module request was | 
|  | *    being issued for each CPU on a system. | 
|  | */ | 
|  |  | 
|  | atomic_long_t total_mod_size; | 
|  | atomic_long_t total_text_size; | 
|  | atomic_long_t invalid_kread_bytes; | 
|  | atomic_long_t invalid_decompress_bytes; | 
|  | static atomic_long_t invalid_becoming_bytes; | 
|  | static atomic_long_t invalid_mod_bytes; | 
|  | atomic_t modcount; | 
|  | atomic_t failed_kreads; | 
|  | atomic_t failed_decompress; | 
|  | static atomic_t failed_becoming; | 
|  | static atomic_t failed_load_modules; | 
|  |  | 
|  | static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) | 
|  | { | 
|  | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && | 
|  | test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) | 
|  | return "Becoming & Load"; | 
|  | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) | 
|  | return "Becoming"; | 
|  | if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) | 
|  | return "Load"; | 
|  | return "Bug-on-stats"; | 
|  | } | 
|  |  | 
|  | void mod_stat_bump_invalid(struct load_info *info, int flags) | 
|  | { | 
|  | atomic_long_add(info->len * 2, &invalid_mod_bytes); | 
|  | atomic_inc(&failed_load_modules); | 
|  | #if defined(CONFIG_MODULE_DECOMPRESS) | 
|  | if (flags & MODULE_INIT_COMPRESSED_FILE) | 
|  | atomic_long_add(info->compressed_len, &invalid_mod_bytes); | 
|  | #endif | 
|  | } | 
|  |  | 
|  | void mod_stat_bump_becoming(struct load_info *info, int flags) | 
|  | { | 
|  | atomic_inc(&failed_becoming); | 
|  | atomic_long_add(info->len, &invalid_becoming_bytes); | 
|  | #if defined(CONFIG_MODULE_DECOMPRESS) | 
|  | if (flags & MODULE_INIT_COMPRESSED_FILE) | 
|  | atomic_long_add(info->compressed_len, &invalid_becoming_bytes); | 
|  | #endif | 
|  | } | 
|  |  | 
|  | int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) | 
|  | { | 
|  | struct mod_fail_load *mod_fail; | 
|  |  | 
|  | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, | 
|  | lockdep_is_held(&module_mutex)) { | 
|  | if (!strcmp(mod_fail->name, name)) { | 
|  | atomic_long_inc(&mod_fail->count); | 
|  | __set_bit(reason, &mod_fail->dup_fail_mask); | 
|  | goto out; | 
|  | } | 
|  | } | 
|  |  | 
|  | mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL); | 
|  | if (!mod_fail) | 
|  | return -ENOMEM; | 
|  | memcpy(mod_fail->name, name, strlen(name)); | 
|  | __set_bit(reason, &mod_fail->dup_fail_mask); | 
|  | atomic_long_inc(&mod_fail->count); | 
|  | list_add_rcu(&mod_fail->list, &dup_failed_modules); | 
|  | out: | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the | 
|  | * 112 module prints within 8k. | 
|  | * | 
|  | * 1024 + (64*112) = 8k | 
|  | */ | 
|  | #define MAX_PREAMBLE 1024 | 
|  | #define MAX_FAILED_MOD_PRINT 112 | 
|  | #define MAX_BYTES_PER_MOD 64 | 
|  | static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, | 
|  | size_t count, loff_t *ppos) | 
|  | { | 
|  | struct mod_fail_load *mod_fail; | 
|  | unsigned int len, size, count_failed = 0; | 
|  | char *buf; | 
|  | int ret; | 
|  | u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; | 
|  | unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, | 
|  | idecompress_bytes, imod_bytes, total_virtual_lost; | 
|  |  | 
|  | live_mod_count = atomic_read(&modcount); | 
|  | fkreads = atomic_read(&failed_kreads); | 
|  | fdecompress = atomic_read(&failed_decompress); | 
|  | fbecoming = atomic_read(&failed_becoming); | 
|  | floads = atomic_read(&failed_load_modules); | 
|  |  | 
|  | total_size = atomic_long_read(&total_mod_size); | 
|  | text_size = atomic_long_read(&total_text_size); | 
|  | ikread_bytes = atomic_long_read(&invalid_kread_bytes); | 
|  | idecompress_bytes = atomic_long_read(&invalid_decompress_bytes); | 
|  | ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes); | 
|  | imod_bytes = atomic_long_read(&invalid_mod_bytes); | 
|  |  | 
|  | total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; | 
|  |  | 
|  | size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), | 
|  | (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; | 
|  | buf = kzalloc(size, GFP_KERNEL); | 
|  | if (buf == NULL) | 
|  | return -ENOMEM; | 
|  |  | 
|  | /* The beginning of our debug preamble */ | 
|  | len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress", | 
|  | fdecompress); | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size); | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes", | 
|  | idecompress_bytes); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes); | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost); | 
|  |  | 
|  | if (live_mod_count && total_size) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size", | 
|  | DIV_ROUND_UP(total_size, live_mod_count)); | 
|  | } | 
|  |  | 
|  | if (live_mod_count && text_size) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size", | 
|  | DIV_ROUND_UP(text_size, live_mod_count)); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * We use WARN_ON_ONCE() for the counters to ensure we always have parity | 
|  | * for keeping tabs on a type of failure with one type of byte counter. | 
|  | * The counters for imod_bytes does not increase for fkreads failures | 
|  | * for example, and so on. | 
|  | */ | 
|  |  | 
|  | WARN_ON_ONCE(ikread_bytes && !fkreads); | 
|  | if (fkreads && ikread_bytes) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes", | 
|  | DIV_ROUND_UP(ikread_bytes, fkreads)); | 
|  | } | 
|  |  | 
|  | WARN_ON_ONCE(ibecoming_bytes && !fbecoming); | 
|  | if (fbecoming && ibecoming_bytes) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes", | 
|  | DIV_ROUND_UP(ibecoming_bytes, fbecoming)); | 
|  | } | 
|  |  | 
|  | WARN_ON_ONCE(idecompress_bytes && !fdecompress); | 
|  | if (fdecompress && idecompress_bytes) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes", | 
|  | DIV_ROUND_UP(idecompress_bytes, fdecompress)); | 
|  | } | 
|  |  | 
|  | WARN_ON_ONCE(imod_bytes && !floads); | 
|  | if (floads && imod_bytes) { | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes", | 
|  | DIV_ROUND_UP(imod_bytes, floads)); | 
|  | } | 
|  |  | 
|  | /* End of our debug preamble header. */ | 
|  |  | 
|  | /* Catch when we've gone beyond our expected preamble */ | 
|  | WARN_ON_ONCE(len >= MAX_PREAMBLE); | 
|  |  | 
|  | if (list_empty(&dup_failed_modules)) | 
|  | goto out; | 
|  |  | 
|  | len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n"); | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n", | 
|  | "Module-name", "How-many-times", "Reason"); | 
|  | mutex_lock(&module_mutex); | 
|  |  | 
|  |  | 
|  | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { | 
|  | if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) | 
|  | goto out_unlock; | 
|  | len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name, | 
|  | atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail)); | 
|  | } | 
|  | out_unlock: | 
|  | mutex_unlock(&module_mutex); | 
|  | out: | 
|  | ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); | 
|  | kfree(buf); | 
|  | return ret; | 
|  | } | 
|  | #undef MAX_PREAMBLE | 
|  | #undef MAX_FAILED_MOD_PRINT | 
|  | #undef MAX_BYTES_PER_MOD | 
|  |  | 
|  | static const struct file_operations fops_mod_stats = { | 
|  | .read = read_file_mod_stats, | 
|  | .open = simple_open, | 
|  | .owner = THIS_MODULE, | 
|  | .llseek = default_llseek, | 
|  | }; | 
|  |  | 
|  | #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) | 
|  | #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) | 
|  | static int __init module_stats_init(void) | 
|  | { | 
|  | mod_debug_add_ulong(total_mod_size); | 
|  | mod_debug_add_ulong(total_text_size); | 
|  | mod_debug_add_ulong(invalid_kread_bytes); | 
|  | mod_debug_add_ulong(invalid_decompress_bytes); | 
|  | mod_debug_add_ulong(invalid_becoming_bytes); | 
|  | mod_debug_add_ulong(invalid_mod_bytes); | 
|  |  | 
|  | mod_debug_add_atomic(modcount); | 
|  | mod_debug_add_atomic(failed_kreads); | 
|  | mod_debug_add_atomic(failed_decompress); | 
|  | mod_debug_add_atomic(failed_becoming); | 
|  | mod_debug_add_atomic(failed_load_modules); | 
|  |  | 
|  | debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  | #undef mod_debug_add_ulong | 
|  | #undef mod_debug_add_atomic | 
|  | module_init(module_stats_init); |