| /* SPDX-License-Identifier: GPL-2.0-only */ |
| /* |
| * Benchmarking code execution time inside the kernel |
| * |
| * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer |
| * for licensing details see kernel-base/COPYING |
| */ |
| #ifndef _LINUX_TIME_BENCH_H |
| #define _LINUX_TIME_BENCH_H |
| |
| /* Main structure used for recording a benchmark run */ |
| struct time_bench_record { |
| uint32_t version_abi; |
| uint32_t loops; /* Requested loop invocations */ |
| uint32_t step; /* option for e.g. bulk invocations */ |
| |
| uint32_t flags; /* Measurements types enabled */ |
| #define TIME_BENCH_LOOP BIT(0) |
| #define TIME_BENCH_TSC BIT(1) |
| #define TIME_BENCH_WALLCLOCK BIT(2) |
| #define TIME_BENCH_PMU BIT(3) |
| |
| uint32_t cpu; /* Used when embedded in time_bench_cpu */ |
| |
| /* Records */ |
| uint64_t invoked_cnt; /* Returned actual invocations */ |
| uint64_t tsc_start; |
| uint64_t tsc_stop; |
| struct timespec64 ts_start; |
| struct timespec64 ts_stop; |
| /* PMU counters for instruction and cycles |
| * instructions counter including pipelined instructions |
| */ |
| uint64_t pmc_inst_start; |
| uint64_t pmc_inst_stop; |
| /* CPU unhalted clock counter */ |
| uint64_t pmc_clk_start; |
| uint64_t pmc_clk_stop; |
| |
| /* Result records */ |
| uint64_t tsc_interval; |
| uint64_t time_start, time_stop, time_interval; /* in nanosec */ |
| uint64_t pmc_inst, pmc_clk; |
| |
| /* Derived result records */ |
| uint64_t tsc_cycles; // +decimal? |
| uint64_t ns_per_call_quotient, ns_per_call_decimal; |
| uint64_t time_sec; |
| uint32_t time_sec_remainder; |
| uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */ |
| }; |
| |
| /* For synchronizing parallel CPUs to run concurrently */ |
| struct time_bench_sync { |
| atomic_t nr_tests_running; |
| struct completion start_event; |
| }; |
| |
| /* Keep track of CPUs executing our bench function. |
| * |
| * Embed a time_bench_record for storing info per cpu |
| */ |
| struct time_bench_cpu { |
| struct time_bench_record rec; |
| struct time_bench_sync *sync; /* back ptr */ |
| struct task_struct *task; |
| /* "data" opaque could have been placed in time_bench_sync, |
| * but to avoid any false sharing, place it per CPU |
| */ |
| void *data; |
| /* Support masking outsome CPUs, mark if it ran */ |
| bool did_bench_run; |
| /* int cpu; // note CPU stored in time_bench_record */ |
| int (*bench_func)(struct time_bench_record *record, void *data); |
| }; |
| |
| /* |
| * Below TSC assembler code is not compatible with other archs, and |
| * can also fail on guests if cpu-flags are not correct. |
| * |
| * The way TSC reading is used, many iterations, does not require as |
| * high accuracy as described below (in Intel Doc #324264). |
| * |
| * Considering changing to use get_cycles() (#include <asm/timex.h>). |
| */ |
| |
/** TSC (Time-Stamp Counter) based **
 * Recommended reading, to understand the details of reading the TSC
 * accurately:
 *  Intel Doc #324264, "How to Benchmark Code Execution Times on Intel"
 *
 * Consider getting exclusive ownership of the CPU by using:
 *   unsigned long flags;
 *   preempt_disable();
 *   raw_local_irq_save(flags);
 *   _your_code_
 *   raw_local_irq_restore(flags);
 *   preempt_enable();
 *
 * Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx"
 *  RDTSC only changes "%rax" and "%rdx", but
 *  CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx)
 */
| static __always_inline uint64_t tsc_start_clock(void) |
| { |
| /* See: Intel Doc #324264 */ |
| unsigned int hi, lo; |
| |
| asm volatile("CPUID\n\t" |
| "RDTSC\n\t" |
| "mov %%edx, %0\n\t" |
| "mov %%eax, %1\n\t" |
| : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); |
| //FIXME: on 32bit use clobbered %eax + %edx |
| return ((uint64_t)lo) | (((uint64_t)hi) << 32); |
| } |
| |
| static __always_inline uint64_t tsc_stop_clock(void) |
| { |
| /* See: Intel Doc #324264 */ |
| unsigned int hi, lo; |
| |
| asm volatile("RDTSCP\n\t" |
| "mov %%edx, %0\n\t" |
| "mov %%eax, %1\n\t" |
| "CPUID\n\t" |
| : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); |
| return ((uint64_t)lo) | (((uint64_t)hi) << 32); |
| } |
| |
| /** Wall-clock based ** |
| * |
| * use: getnstimeofday() |
| * getnstimeofday(&rec->ts_start); |
| * getnstimeofday(&rec->ts_stop); |
| * |
| * API changed see: Documentation/core-api/timekeeping.rst |
| * https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstimeofday |
| * |
| * We should instead use: ktime_get_real_ts64() is a direct |
| * replacement, but consider using monotonic time (ktime_get_ts64()) |
| * and/or a ktime_t based interface (ktime_get()/ktime_get_real()). |
| */ |
| |
/** PMU (Performance Monitor Unit) based **
 *
 * Needed for calculating: Instructions Per Cycle (IPC)
 * - The IPC number tells how efficient the CPU pipelining was
 */
| //lookup: perf_event_create_kernel_counter() |
| |
/*
 * Declared here, implemented outside this header.
 * NOTE(review): presumably turns the PMU counters read by pmc_inst() and
 * pmc_clk() on or off according to @enable and reports the outcome in the
 * return value - confirm against the implementation.
 */
bool time_bench_PMU_config(bool enable);
| |
| /* Raw reading via rdpmc() using fixed counters |
| * |
| * From: https://github.com/andikleen/simple-pmu |
| */ |
enum {
	/* Selector bit, i.e. bit 30 (1U << 30) */
	FIXED_SELECT = 0x40000000,

	/* Counter indexes; OR-ed with FIXED_SELECT before being passed on */
	FIXED_INST_RETIRED_ANY = 0,
	FIXED_CPU_CLK_UNHALTED_CORE = 1,
	FIXED_CPU_CLK_UNHALTED_REF = 2,
};
| |
| static __always_inline unsigned int long long p_rdpmc(unsigned int in) |
| { |
| unsigned int d, a; |
| |
| asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory"); |
| return ((unsigned long long)d << 32) | a; |
| } |
| |
/* These PMU counters need to be enabled, but I don't have the
 * configuration code implemented. My current hack is running:
 *  sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko
 */
| /* Reading all pipelined instruction */ |
| static __always_inline unsigned long long pmc_inst(void) |
| { |
| return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY); |
| } |
| |
| /* Reading CPU clock cycles */ |
| static __always_inline unsigned long long pmc_clk(void) |
| { |
| return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE); |
| } |
| |
/* Raw reading via MSR rdmsr() is likely wrong
 * FIXME: How can I know which raw MSR registers are configured for what?
 */
#define MSR_IA32_PCM0	0x400000C1 /* PERFCTR0 */
#define MSR_IA32_PCM1	0x400000C2 /* PERFCTR1 */
#define MSR_IA32_PCM2	0x400000C3

/*
 * Read MSR_IA32_PCM0 via rdmsrq_safe(), which stores through @msr_result.
 *
 * Return: whatever rdmsrq_safe() returned, converted to uint64_t.
 * NOTE(review): presumably that is a status code rather than the counter
 * value, and a negative status would show up here as a huge unsigned
 * number - confirm against rdmsrq_safe() before relying on it.
 */
static inline uint64_t msr_inst(unsigned long long *msr_result)
{
	const uint64_t ret = rdmsrq_safe(MSR_IA32_PCM0, msr_result);

	return ret;
}
| |
/** Generic functions **
 *
 * Declared here, implemented outside this header. The callbacks take the
 * time_bench_record to fill in plus the caller's opaque data pointer.
 */
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
		     int (*func)(struct time_bench_record *rec, void *data));

bool time_bench_calc_stats(struct time_bench_record *rec);

void time_bench_run_concurrent(
	uint32_t loops, int step, void *data,
	const struct cpumask *mask, /* Support masking out some CPUs */
	struct time_bench_sync *sync, struct time_bench_cpu *cpu_tasks,
	int (*func)(struct time_bench_record *record, void *data));

void time_bench_print_stats_cpumask(const char *desc,
				    struct time_bench_cpu *cpu_tasks,
				    const struct cpumask *mask);
| |
| //FIXME: use rec->flags to select measurement, should be MACRO |
| static __always_inline void time_bench_start(struct time_bench_record *rec) |
| { |
| //getnstimeofday(&rec->ts_start); |
| ktime_get_real_ts64(&rec->ts_start); |
| if (rec->flags & TIME_BENCH_PMU) { |
| rec->pmc_inst_start = pmc_inst(); |
| rec->pmc_clk_start = pmc_clk(); |
| } |
| rec->tsc_start = tsc_start_clock(); |
| } |
| |
| static __always_inline void time_bench_stop(struct time_bench_record *rec, |
| uint64_t invoked_cnt) |
| { |
| rec->tsc_stop = tsc_stop_clock(); |
| if (rec->flags & TIME_BENCH_PMU) { |
| rec->pmc_inst_stop = pmc_inst(); |
| rec->pmc_clk_stop = pmc_clk(); |
| } |
| //getnstimeofday(&rec->ts_stop); |
| ktime_get_real_ts64(&rec->ts_stop); |
| rec->invoked_cnt = invoked_cnt; |
| } |
| |
| #endif /* _LINUX_TIME_BENCH_H */ |