// SPDX-License-Identifier: GPL-2.0-only
/*
* Benchmarking code execution time inside the kernel
*
* Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/time.h>
#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */
/* For concurrency testing */
#include <linux/completion.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include "time_bench.h"
static int verbose = 1;
/** TSC (Time-Stamp Counter) based **
* See: linux/time_bench.h
* tsc_start_clock() and tsc_stop_clock()
*/
/** Wall-clock based **
*/
/** PMU (Performance Monitor Unit) based **
*/
#define PERF_FORMAT \
(PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \
PERF_FORMAT_TOTAL_TIME_RUNNING)
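/* The read_format above requests the whole group, per-event IDs, and
 * the time each counter was enabled vs. actually running, so results
 * can be scaled if the PMU ever multiplexes the counters.
 */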
struct raw_perf_event {
uint64_t config; /* event */
uint64_t config1; /* umask */
struct perf_event *save;
char *desc;
};
/* If HT (Hyper-Threading) is enabled, a maximum of 4 events (5 if one
 * is instructions retired) can be specified; if HT is disabled, a
 * maximum of 8 (9 if one is instructions retired) can be specified.
 *
 * From Table 19-1. Architectural Performance Events
 * Intel 64 and IA-32 Architectures Software Developer’s Manual
 * Volume 3: System Programming Guide
 */
struct raw_perf_event perf_events[] = {
{ 0x3c, 0x00, NULL, "Unhalted CPU Cycles" },
{ 0xc0, 0x00, NULL, "Instruction Retired" }
};
#define NUM_EVTS (ARRAY_SIZE(perf_events))
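/* Further architectural events from the same SDM table could be added
 * to the array above, subject to the HT limits; a sketch (event/umask
 * encodings from Table 19-1):
 *
 *   { 0x3c, 0x01, NULL, "Unhalted Reference Cycles" },
 *   { 0x2e, 0x4f, NULL, "LLC Reference" },
 *   { 0x2e, 0x41, NULL, "LLC Misses" },
 */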
/* WARNING: PMU config is currently broken!
*/
bool time_bench_PMU_config(bool enable)
{
int i;
struct perf_event_attr perf_conf;
struct perf_event *perf_event;
int cpu;
preempt_disable();
cpu = smp_processor_id();
pr_info("DEBUG: cpu:%d\n", cpu);
preempt_enable();
memset(&perf_conf, 0, sizeof(struct perf_event_attr));
perf_conf.type = PERF_TYPE_RAW;
perf_conf.size = sizeof(struct perf_event_attr);
perf_conf.read_format = PERF_FORMAT;
perf_conf.pinned = 1;
perf_conf.exclude_user = 1; /* Exclude userspace events */
perf_conf.exclude_kernel = 0; /* Include kernel events */
for (i = 0; i < NUM_EVTS; i++) {
perf_conf.disabled = enable;
//perf_conf.disabled = (i == 0) ? 1 : 0;
perf_conf.config = perf_events[i].config;
perf_conf.config1 = perf_events[i].config1;
if (verbose)
pr_info("%s() enable PMU counter: %s\n",
__func__, perf_events[i].desc);
perf_event = perf_event_create_kernel_counter(&perf_conf, cpu,
NULL /* task */,
NULL /* overflow_handler */,
NULL /* context */);
/* Note: returns ERR_PTR() on failure, not NULL */
if (!IS_ERR(perf_event)) {
perf_events[i].save = perf_event;
pr_info("%s():DEBUG perf_event success\n", __func__);
perf_event_enable(perf_event);
} else {
pr_info("%s():DEBUG perf_event failed: %ld\n",
__func__, PTR_ERR(perf_event));
}
}
return true;
}
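/* Sketch of how the saved counters could later be read back and
 * released, using the standard kernel APIs (untested here, given the
 * WARNING above that the PMU config is broken):
 *
 *   u64 enabled, running, val;
 *
 *   val = perf_event_read_value(perf_events[i].save,
 *                               &enabled, &running);
 *   perf_event_release_kernel(perf_events[i].save);
 */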
/** Generic functions **
*/
/* Calculate stats, store results in record */
bool time_bench_calc_stats(struct time_bench_record *rec)
{
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
uint64_t ns_per_call_tmp_rem = 0;
uint32_t ns_per_call_remainder = 0;
uint64_t pmc_ipc_tmp_rem = 0;
uint32_t pmc_ipc_remainder = 0;
uint32_t pmc_ipc_div = 0;
uint32_t invoked_cnt_precision = 0;
uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */
if (rec->flags & TIME_BENCH_LOOP) {
if (rec->invoked_cnt < 1000) {
pr_err("ERR: need more(>1000) loops(%llu) for timing\n",
rec->invoked_cnt);
return false;
}
if (rec->invoked_cnt > ((1ULL << 32) - 1)) {
/* div_u64_rem() only supports a 32-bit divisor */
pr_err("ERR: Invoke cnt(%llu) too big, overflows 32-bit\n",
rec->invoked_cnt);
return false;
}
invoked_cnt = (uint32_t)rec->invoked_cnt;
}
/* TSC (Time-Stamp Counter) records */
if (rec->flags & TIME_BENCH_TSC) {
rec->tsc_interval = rec->tsc_stop - rec->tsc_start;
if (rec->tsc_interval == 0) {
pr_err("ABORT: timing took ZERO TSC time\n");
return false;
}
/* Calculate stats */
if (rec->flags & TIME_BENCH_LOOP)
rec->tsc_cycles = rec->tsc_interval / invoked_cnt;
else
rec->tsc_cycles = rec->tsc_interval;
}
/* Wall-clock time calc */
if (rec->flags & TIME_BENCH_WALLCLOCK) {
rec->time_start = rec->ts_start.tv_nsec +
(NANOSEC_PER_SEC * rec->ts_start.tv_sec);
rec->time_stop = rec->ts_stop.tv_nsec +
(NANOSEC_PER_SEC * rec->ts_stop.tv_sec);
rec->time_interval = rec->time_stop - rec->time_start;
if (rec->time_interval == 0) {
pr_err("ABORT: timing took ZERO wallclock time\n");
return false;
}
/* Calculate stats */
/*** Division in the kernel is tricky ***/
/* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */
/* remainder only correct because NANOSEC_PER_SEC is 10^9 */
rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC,
&rec->time_sec_remainder);
//TODO: use existing struct timespec records instead of div?
if (rec->flags & TIME_BENCH_LOOP) {
/*** Division in the kernel is tricky ***/
/* Orig: ns = ((double)time_interval / invoked_cnt); */
/* First get quotient */
rec->ns_per_call_quotient =
div_u64_rem(rec->time_interval, invoked_cnt,
&ns_per_call_remainder);
/* Now get the .xxx decimal precision (no rounding of last digit) */
ns_per_call_tmp_rem = ns_per_call_remainder;
invoked_cnt_precision = invoked_cnt / 1000;
if (invoked_cnt_precision > 0) {
rec->ns_per_call_decimal =
div_u64_rem(ns_per_call_tmp_rem,
invoked_cnt_precision,
&ns_per_call_remainder);
}
}
}
/* Performance Monitor Unit (PMU) counters */
if (rec->flags & TIME_BENCH_PMU) {
//FIXME: Overflow handling???
rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start;
rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start;
/* Calc Instruction Per Cycle (IPC) */
/* First get quotient */
rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk,
&pmc_ipc_remainder);
/* Now get the .xxx decimal precision (no rounding of last digit) */
pmc_ipc_tmp_rem = pmc_ipc_remainder;
pmc_ipc_div = rec->pmc_clk / 1000;
if (pmc_ipc_div > 0) {
rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem,
pmc_ipc_div,
&pmc_ipc_remainder);
}
}
return true;
}
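/* Worked example of the fixed-point division above: with
 * time_interval = 1234567890 ns and invoked_cnt = 1000000,
 * div_u64_rem() yields quotient 1234 and remainder 567890; dividing
 * the remainder by invoked_cnt/1000 = 1000 gives the three decimal
 * digits 567, which print as 1234.567 ns per call.
 */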
/* Generic function for invoking a loop function and calculating
* execution time stats. The function being called/timed is assumed
* to perform a tight loop, and update the timing record struct.
*/
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
int (*func)(struct time_bench_record *record, void *data))
{
struct time_bench_record rec;
/* Setup record */
memset(&rec, 0, sizeof(rec)); /* zero func might not update all */
rec.version_abi = 1;
rec.loops = loops;
rec.step = step;
rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | TIME_BENCH_WALLCLOCK);
/*** Loop function being timed ***/
if (!func(&rec, data)) {
pr_err("ABORT: function being timed failed\n");
return false;
}
if (rec.invoked_cnt < loops)
pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n",
rec.invoked_cnt, loops);
/* Calculate stats */
time_bench_calc_stats(&rec);
pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
txt, rec.tsc_cycles, rec.ns_per_call_quotient,
rec.ns_per_call_decimal, rec.step, rec.time_sec,
rec.time_sec_remainder, rec.time_interval, rec.invoked_cnt,
rec.tsc_interval);
if (rec.flags & TIME_BENCH_PMU)
pr_info("Type:%s PMU inst/clock%llu/%llu = %llu.%03llu IPC (inst per cycle)\n",
txt, rec.pmc_inst, rec.pmc_clk, rec.pmc_ipc_quotient,
rec.pmc_ipc_decimal);
return true;
}
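/* Example invocation (a sketch; my_bench_func is a hypothetical
 * callback that must run the measured loop, set rec->invoked_cnt, and
 * record the start/stop timestamps, e.g. via the helpers declared in
 * time_bench.h):
 *
 *   time_bench_loop(10000000, 0, "empty-loop-baseline", NULL,
 *                   my_bench_func);
 */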
/* Function invoked by each benchmark kthread; synchronizes with the
 * controller through sync->nr_tests_running and sync->start_event
 */
static int invoke_test_on_cpu_func(void *private)
{
struct time_bench_cpu *cpu = private;
struct time_bench_sync *sync = cpu->sync;
cpumask_t newmask = CPU_MASK_NONE;
void *data = cpu->data;
/* Restrict CPU */
cpumask_set_cpu(cpu->rec.cpu, &newmask);
set_cpus_allowed_ptr(current, &newmask);
/* Synchronize start of concurrency test */
atomic_inc(&sync->nr_tests_running);
wait_for_completion(&sync->start_event);
/* Start benchmark function */
if (!cpu->bench_func(&cpu->rec, data)) {
pr_err("ERROR: function being timed failed on CPU:%d(%d)\n",
cpu->rec.cpu, smp_processor_id());
} else {
if (verbose)
pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu,
smp_processor_id());
}
cpu->did_bench_run = true;
/* End test */
atomic_dec(&sync->nr_tests_running);
/* Wait for kthread_stop() telling us to stop */
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
__set_current_state(TASK_RUNNING);
return 0;
}
void time_bench_print_stats_cpumask(const char *desc,
struct time_bench_cpu *cpu_tasks,
const struct cpumask *mask)
{
uint64_t average = 0;
int cpu;
int step = 0;
struct sum {
uint64_t tsc_cycles;
int records;
} sum = { 0 };
/* Get stats */
for_each_cpu(cpu, mask) {
struct time_bench_cpu *c = &cpu_tasks[cpu];
struct time_bench_record *rec = &c->rec;
/* Calculate stats */
time_bench_calc_stats(rec);
pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient,
rec->ns_per_call_decimal, rec->step, rec->time_sec,
rec->time_sec_remainder, rec->time_interval,
rec->invoked_cnt, rec->tsc_interval);
/* Collect average */
sum.records++;
sum.tsc_cycles += rec->tsc_cycles;
step = rec->step;
}
if (sum.records) /* avoid div-by-zero */
average = sum.tsc_cycles / sum.records;
pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc,
average, sum.records, step);
}
void time_bench_run_concurrent(uint32_t loops, int step, void *data,
const struct cpumask *mask, /* Supports masking out some CPUs */
struct time_bench_sync *sync,
struct time_bench_cpu *cpu_tasks,
int (*func)(struct time_bench_record *record, void *data))
{
int cpu, running = 0;
if (verbose) // DEBUG
pr_warn("%s() Started on CPU:%d\n", __func__,
smp_processor_id());
/* Reset sync conditions */
atomic_set(&sync->nr_tests_running, 0);
init_completion(&sync->start_event);
/* Spawn off jobs on all CPUs */
for_each_cpu(cpu, mask) {
struct time_bench_cpu *c = &cpu_tasks[cpu];
running++;
c->sync = sync; /* Send sync variable along */
c->data = data; /* Send opaque along */
/* Init benchmark record */
memset(&c->rec, 0, sizeof(struct time_bench_record));
c->rec.version_abi = 1;
c->rec.loops = loops;
c->rec.step = step;
c->rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC |
TIME_BENCH_WALLCLOCK);
c->rec.cpu = cpu;
c->bench_func = func;
c->task = kthread_run(invoke_test_on_cpu_func, c,
"time_bench%d", cpu);
if (IS_ERR(c->task)) {
pr_err("%s(): Failed to start test func\n", __func__);
return; /* Argh, what about cleanup?! */
}
}
/* Wait until all processes are running */
while (atomic_read(&sync->nr_tests_running) < running) {
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(10);
}
/* Kick off all CPU concurrently on completion event */
complete_all(&sync->start_event);
/* Wait for CPUs to finish */
while (atomic_read(&sync->nr_tests_running)) {
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(10);
}
/* Stop the kthreads */
for_each_cpu(cpu, mask) {
struct time_bench_cpu *c = &cpu_tasks[cpu];
kthread_stop(c->task);
}
if (verbose) // DEBUG - often finishes on another CPU
pr_warn("%s() Finished on CPU:%d\n", __func__,
smp_processor_id());
}
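/* Sketch of a concurrent run across all online CPUs (illustrative
 * names; cpu_tasks must cover every CPU id in the mask, and
 * my_bench_func is the same hypothetical callback as above):
 *
 *   struct time_bench_sync sync;
 *   struct time_bench_cpu *cpu_tasks;
 *
 *   cpu_tasks = kcalloc(nr_cpu_ids, sizeof(*cpu_tasks), GFP_KERNEL);
 *   time_bench_run_concurrent(10000000, 0, NULL, cpu_online_mask,
 *                             &sync, cpu_tasks, my_bench_func);
 *   time_bench_print_stats_cpumask("concurrent", cpu_tasks,
 *                                  cpu_online_mask);
 *   kfree(cpu_tasks);
 */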