arch/x86/kernel/cpu/resctrl/pseudo_lock.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0
 /*
  * Resource Director Technology (RDT)
  *
  * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
  *
  * Copyright (C) 2018 Intel Corporation
  *
  * Author: Reinette Chatre <reinette.chatre@intel.com>
  */

 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

 #include <linux/cacheflush.h>
 #include <linux/cpu.h>
 #include <linux/perf_event.h>
 #include <linux/pm_qos.h>
 #include <linux/resctrl.h>

 #include <asm/cpu_device_id.h>
 #include <asm/perf_event.h>
 #include <asm/msr.h>

 #include "../../events/perf_event.h" /* For X86_CONFIG() */
 #include "internal.h"

 #define CREATE_TRACE_POINTS

 #include "pseudo_lock_trace.h"

 /*
  * The bits needed to disable hardware prefetching vary based on the
  * platform. resctrl_arch_get_prefetch_disable_bits() discovers which bits
  * to use and stores them here; the pseudo-lock and measurement functions
  * in this file write the value to MSR_MISC_FEATURE_CONTROL.
  */
 static u64 prefetch_disable_bits;

 /**
  * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
  *                                          platforms
  * @void: It takes no parameters.
  *
  * Capture the list of platforms that have been validated to support
  * pseudo-locking. This includes testing to ensure pseudo-locked regions
  * with low cache miss rates can be created under variety of load conditions
  * as well as that these pseudo-locked regions can maintain their low cache
  * miss rates under variety of load conditions for significant lengths of time.
  *
  * After a platform has been validated to support pseudo-locking its
  * hardware prefetch disable bits are included here as they are documented
  * in the SDM.
  *
  * When adding a platform here also add support for its cache events to
  * resctrl_arch_measure_l*_residency()
  *
  * Return:
  * If platform is supported, the bits to disable hardware prefetchers, 0
  * if platform is not supported.
  */
 u64 resctrl_arch_get_prefetch_disable_bits(void)
 {
 	/* Stays 0 unless a validated platform is recognised below. */
 	u64 bits = 0;

 	/* Only Intel family 6 parts are considered at all. */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
 	    boot_cpu_data.x86 == 6) {
 		switch (boot_cpu_data.x86_vfm) {
 		case INTEL_BROADWELL_X:
 			/*
 			 * MSR_MISC_FEATURE_CONTROL layout per the SDM:
 			 * 0    L2 Hardware Prefetcher Disable (R/W)
 			 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
 			 * 2    DCU Hardware Prefetcher Disable (R/W)
 			 * 3    DCU IP Prefetcher Disable (R/W)
 			 * 63:4 Reserved
 			 */
 			bits = 0xF;
 			break;
 		case INTEL_ATOM_GOLDMONT:
 		case INTEL_ATOM_GOLDMONT_PLUS:
 			/*
 			 * MSR_MISC_FEATURE_CONTROL layout per the SDM:
 			 * 0     L2 Hardware Prefetcher Disable (R/W)
 			 * 1     Reserved
 			 * 2     DCU Hardware Prefetcher Disable (R/W)
 			 * 63:3  Reserved
 			 */
 			bits = 0x5;
 			break;
 		}
 	}

 	/* Remember the result for the pseudo-lock and measurement paths. */
 	prefetch_disable_bits = bits;

 	return bits;
 }

 /**
  * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
  * @_plr: the pseudo-lock region descriptor
  *
  * This is the core pseudo-locking flow.
  *
  * First we ensure that the kernel memory cannot be found in the cache.
  * Then, while taking care that there will be as little interference as
  * possible, the memory to be loaded is accessed while core is running
  * with class of service set to the bitmask of the pseudo-locked region.
  * After this is complete no future CAT allocations will be allowed to
  * overlap with this bitmask.
  *
  * Local register variables are utilized to ensure that the memory region
  * to be locked is the only memory access made during the critical locking
  * loop.
  *
  * Return: 0. Waiter on waitqueue will be woken on completion.
  */
 int resctrl_arch_pseudo_lock_fn(void *_plr)
 {
 	struct pseudo_lock_region *plr = _plr;
 	/* RMID and CLOSID active on this CPU before the switch below */
 	u32 rmid_p, closid_p;
 	unsigned long i;
 	/* MSR_MISC_FEATURE_CONTROL contents to put back once loading is done */
 	u64 saved_msr;
 #ifdef CONFIG_KASAN
 	/*
 	 * The registers used for local register variables are also used
 	 * when KASAN is active. When KASAN is active we use a regular
 	 * variable to ensure we always use a valid pointer, but the cost
 	 * is that this variable will enter the cache through evicting the
 	 * memory we are trying to lock into the cache. Thus expect lower
 	 * pseudo-locking success rate when KASAN is active.
 	 */
 	unsigned int line_size;
 	unsigned int size;
 	void *mem_r;
 #else
 	/* Pin the loop operands to fixed registers (see function comment). */
 	register unsigned int line_size asm("esi");
 	register unsigned int size asm("edi");
 	register void *mem_r asm(_ASM_BX);
 #endif /* CONFIG_KASAN */

 	/*
 	 * Make sure none of the allocated memory is cached. If it is we
 	 * will get a cache hit in below loop from outside of pseudo-locked
 	 * region.
 	 * wbinvd (as opposed to clflush/clflushopt) is required to
 	 * increase likelihood that allocated cache portion will be filled
 	 * with associated memory.
 	 */
 	wbinvd();

 	/*
 	 * Always called with interrupts enabled. By disabling interrupts
 	 * ensure that we will not be preempted during this critical section.
 	 */
 	local_irq_disable();

 	/*
 	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
 	 * clobbering local register variables or affecting cache accesses.
 	 *
 	 * Disable the hardware prefetcher so that when the end of the memory
 	 * being pseudo-locked is reached the hardware will not read beyond
 	 * the buffer and evict pseudo-locked memory read earlier from the
 	 * cache.
 	 */
 	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
 	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
 	/*
 	 * Snapshot this CPU's current CLOSID/RMID and copy the region
 	 * parameters into the locals declared above before the CLOSID is
 	 * switched.
 	 */
 	closid_p = this_cpu_read(pqr_state.cur_closid);
 	rmid_p = this_cpu_read(pqr_state.cur_rmid);
 	mem_r = plr->kmem;
 	size = plr->size;
 	line_size = plr->line_size;
 	/*
 	 * Critical section begin: start by writing the closid associated
 	 * with the capacity bitmask of the cache region being
 	 * pseudo-locked followed by reading of kernel memory to load it
 	 * into the cache.
 	 */
 	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

 	/*
 	 * Cache was flushed earlier. Now access kernel memory to read it
 	 * into cache region associated with just activated plr->closid.
 	 * Loop over data twice:
 	 * - In first loop the cache region is shared with the page walker
 	 *   as it populates the paging structure caches (including TLB).
 	 * - In the second loop the paging structure caches are used and
 	 *   cache region is populated with the memory being referenced.
 	 */
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		/*
 		 * Add a barrier to prevent speculative execution of this
 		 * loop reading beyond the end of the buffer.
 		 */
 		rmb();
 		/*
 		 * One load per page: read from mem_r + i into %eax. The asm
 		 * has no outputs, so the loaded value itself is not used.
 		 */
 		asm volatile("mov (%0,%1,1), %%eax\n\t"
 			:
 			: "r" (mem_r), "r" (i)
 			: "%eax", "memory");
 	}
 	for (i = 0; i < size; i += line_size) {
 		/*
 		 * Add a barrier to prevent speculative execution of this
 		 * loop reading beyond the end of the buffer.
 		 */
 		rmb();
 		/* Same load as above, now stepping by line_size bytes. */
 		asm volatile("mov (%0,%1,1), %%eax\n\t"
 			:
 			: "r" (mem_r), "r" (i)
 			: "%eax", "memory");
 	}
 	/*
 	 * Critical section end: restore closid with capacity bitmask that
 	 * does not overlap with pseudo-locked region.
 	 */
 	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

 	/* Re-enable the hardware prefetcher(s) */
 	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
 	local_irq_enable();

 	/* Flag completion, then wake any waiter on lock_thread_wq. */
 	plr->thread_done = 1;
 	wake_up_interruptible(&plr->lock_thread_wq);
 	return 0;
 }

 /**
  * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
  *                                      pseudo-locked memory
  * @_plr: pseudo-lock region to measure
  *
  * There is no deterministic way to test if a memory region is cached. One
  * way is to measure how long it takes to read the memory, the speed of
  * access is a good way to learn how close to the cpu the data was. Even
  * more, if the prefetcher is disabled and the memory is read at a stride
  * of half the cache line, then a cache miss will be easy to spot since the
  * read of the first half would be significantly slower than the read of
  * the second half.
  *
  * Return: 0. Waiter on waitqueue will be woken on completion.
  */
 int resctrl_arch_measure_cycles_lat_fn(void *_plr)
 {
 	struct pseudo_lock_region *plr = _plr;
 	/* Low/high halves of MSR_MISC_FEATURE_CONTROL, restored at the end */
 	u32 saved_low, saved_high;
 	unsigned long i;
 	u64 start, end;
 	void *mem_r;

 	/* Interrupts stay off for the whole measurement. */
 	local_irq_disable();
 	/*
 	 * Disable hardware prefetchers.
 	 */
 	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
 	mem_r = READ_ONCE(plr->kmem);
 	/*
 	 * Dummy execute of the time measurement to load the needed
 	 * instructions into the L1 instruction cache.
 	 */
 	start = rdtsc_ordered();
 	/*
 	 * Walk the region in 32 byte steps. Each step times a single load
 	 * from mem_r + i and emits one tracepoint carrying the TSC delta
 	 * truncated to 32 bits.
 	 */
 	for (i = 0; i < plr->size; i += 32) {
 		start = rdtsc_ordered();
 		asm volatile("mov (%0,%1,1), %%eax\n\t"
 			     :
 			     : "r" (mem_r), "r" (i)
 			     : "%eax", "memory");
 		end = rdtsc_ordered();
 		trace_pseudo_lock_mem_latency((u32)(end - start));
 	}
 	/* Put back the prefetcher configuration saved above. */
 	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	local_irq_enable();
 	/* Flag completion, then wake any waiter on lock_thread_wq. */
 	plr->thread_done = 1;
 	wake_up_interruptible(&plr->lock_thread_wq);
 	return 0;
 }

 /*
  * Create a perf_event_attr for the hit and miss perf events that will
  * be used during the performance measurement. A perf_event maintains
  * a pointer to its perf_event_attr so a unique attribute structure is
  * created for each perf_event.
  *
  * The actual configuration of the event is set right before use in order
  * to use the X86_CONFIG macro.
  */
 /*
  * Attributes of the cache-miss event. .config is left unset here; it is
  * assigned in resctrl_arch_measure_l2_residency() and
  * resctrl_arch_measure_l3_residency() right before the event is created.
  */
 static struct perf_event_attr perf_miss_attr = {
 	.type		= PERF_TYPE_RAW,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 0,
 	.exclude_user	= 1,
 };

 /*
  * Attributes of the cache-hit event, initialised the same way as
  * perf_miss_attr. .config is assigned by the
  * resctrl_arch_measure_l*_residency() functions; on Broadwell it is set
  * to an event that counts references rather than hits.
  */
 static struct perf_event_attr perf_hit_attr = {
 	.type		= PERF_TYPE_RAW,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 0,
 	.exclude_user	= 1,
 };

 /*
  * Raw miss/hit counter values filled in by measure_residency_fn():
  * the *_before values are sampled right before the region is read,
  * the *_after values right after. All four are zero if the
  * measurement could not be set up.
  */
 struct residency_counts {
 	u64 miss_before, hits_before;
 	u64 miss_after,  hits_after;
 };

 /*
  * measure_residency_fn - Sample hit and miss counters around a read of a
  *                        pseudo-locked region
  * @miss_attr: attributes of the perf event counting misses
  * @hit_attr:  attributes of the perf event counting hits
  * @plr:       region to read; ->cpu, ->kmem, ->size and ->line_size are used
  * @counts:    receives the counter values sampled before and after the
  *             read loop; all four are zero if event setup or validation
  *             fails
  *
  * Return: always 0.
  */
 static int measure_residency_fn(struct perf_event_attr *miss_attr,
 				struct perf_event_attr *hit_attr,
 				struct pseudo_lock_region *plr,
 				struct residency_counts *counts)
 {
 	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
 	struct perf_event *miss_event, *hit_event;
 	int hit_pmcnum, miss_pmcnum;
 	/* Low/high halves of MSR_MISC_FEATURE_CONTROL, restored at the end */
 	u32 saved_low, saved_high;
 	unsigned int line_size;
 	unsigned int size;
 	unsigned long i;
 	void *mem_r;
 	/* Scratch target for the validation reads; the value is not used */
 	u64 tmp;

 	/* Both events are created on plr->cpu. */
 	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
 						      NULL, NULL, NULL);
 	if (IS_ERR(miss_event))
 		goto out;

 	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
 						     NULL, NULL, NULL);
 	if (IS_ERR(hit_event))
 		goto out_miss;

 	local_irq_disable();
 	/*
 	 * Check any possible error state of events used by performing
 	 * one local read.
 	 */
 	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
 		local_irq_enable();
 		goto out_hit;
 	}
 	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
 		local_irq_enable();
 		goto out_hit;
 	}

 	/*
 	 * Disable hardware prefetchers.
 	 */
 	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

 	/* Initialize rest of local variables */
 	/*
 	 * Performance event has been validated right before this with
 	 * interrupts disabled - it is thus safe to read the counter index.
 	 */
 	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
 	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
 	line_size = READ_ONCE(plr->line_size);
 	mem_r = READ_ONCE(plr->kmem);
 	size = READ_ONCE(plr->size);

 	/*
 	 * Read counter variables twice - first to load the instructions
 	 * used in L1 cache, second to capture accurate value that does not
 	 * include cache misses incurred because of instruction loads.
 	 */
 	hits_before = rdpmc(hit_pmcnum);
 	miss_before = rdpmc(miss_pmcnum);
 	/*
 	 * From SDM: Performing back-to-back fast reads are not guaranteed
 	 * to be monotonic.
 	 * Use LFENCE to ensure all previous instructions are retired
 	 * before proceeding.
 	 */
 	rmb();
 	hits_before = rdpmc(hit_pmcnum);
 	miss_before = rdpmc(miss_pmcnum);
 	/*
 	 * Use LFENCE to ensure all previous instructions are retired
 	 * before proceeding.
 	 */
 	rmb();
 	/* The measured access pattern: one load every line_size bytes. */
 	for (i = 0; i < size; i += line_size) {
 		/*
 		 * Add a barrier to prevent speculative execution of this
 		 * loop reading beyond the end of the buffer.
 		 */
 		rmb();
 		asm volatile("mov (%0,%1,1), %%eax\n\t"
 			     :
 			     : "r" (mem_r), "r" (i)
 			     : "%eax", "memory");
 	}
 	/*
 	 * Use LFENCE to ensure all previous instructions are retired
 	 * before proceeding.
 	 */
 	rmb();
 	hits_after = rdpmc(hit_pmcnum);
 	miss_after = rdpmc(miss_pmcnum);
 	/*
 	 * Use LFENCE to ensure all previous instructions are retired
 	 * before proceeding.
 	 */
 	rmb();
 	/* Re-enable hardware prefetchers */
 	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	local_irq_enable();
 	/* Unwind in reverse order of creation. */
 out_hit:
 	perf_event_release_kernel(hit_event);
 out_miss:
 	perf_event_release_kernel(miss_event);
 out:
 	/*
 	 * All counts will be zero on failure.
 	 */
 	counts->miss_before = miss_before;
 	counts->hits_before = hits_before;
 	counts->miss_after  = miss_after;
 	counts->hits_after  = hits_after;
 	return 0;
 }

 int resctrl_arch_measure_l2_residency(void *_plr)
 {
 	struct pseudo_lock_region *plr = _plr;
 	struct residency_counts counts = {0};

 	/*
 	 * Non-architectural event for the Goldmont Microarchitecture
 	 * from Intel x86 Architecture Software Developer Manual (SDM):
 	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
 	 * Umask values:
 	 *     L2_HIT   02H
 	 *     L2_MISS  10H
 	 */
 	switch (boot_cpu_data.x86_vfm) {
 	case INTEL_ATOM_GOLDMONT:
 	case INTEL_ATOM_GOLDMONT_PLUS:
 		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
 						   .umask = 0x10);
 		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
 						  .umask = 0x2);
 		break;
 	default:
 		goto out;
 	}

 	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
 	/*
 	 * If a failure prevented the measurements from succeeding
 	 * tracepoints will still be written and all counts will be zero.
 	 */
 	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
 			     counts.miss_after - counts.miss_before);
 out:
 	plr->thread_done = 1;
 	wake_up_interruptible(&plr->lock_thread_wq);
 	return 0;
 }

 int resctrl_arch_measure_l3_residency(void *_plr)
 {
 	struct pseudo_lock_region *plr = _plr;
 	struct residency_counts counts = {0};

 	/*
 	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
 	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
 	 * this platform the following events are used instead:
 	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
 	 *       REFERENCE 4FH
 	 *       MISS      41H
 	 */

 	switch (boot_cpu_data.x86_vfm) {
 	case INTEL_BROADWELL_X:
 		/* On BDW the hit event counts references, not hits */
 		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
 						  .umask = 0x4f);
 		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
 						   .umask = 0x41);
 		break;
 	default:
 		goto out;
 	}

 	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
 	/*
 	 * If a failure prevented the measurements from succeeding
 	 * tracepoints will still be written and all counts will be zero.
 	 */

 	counts.miss_after -= counts.miss_before;
 	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
 		/*
 		 * On BDW references and misses are counted, need to adjust.
 		 * Sometimes the "hits" counter is a bit more than the
 		 * references, for example, x references but x + 1 hits.
 		 * To not report invalid hit values in this case we treat
 		 * that as misses equal to references.
 		 */
 		/* First compute the number of cache references measured */
 		counts.hits_after -= counts.hits_before;
 		/* Next convert references to cache hits */
 		counts.hits_after -= min(counts.miss_after, counts.hits_after);
 	} else {
 		counts.hits_after -= counts.hits_before;
 	}

 	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
 out:
 	plr->thread_done = 1;
 	wake_up_interruptible(&plr->lock_thread_wq);
 	return 0;
 }
	// SPDX-License-Identifier: GPL-2.0
	/*
	* Resource Director Technology (RDT)
	*
	* Pseudo-locking support built on top of Cache Allocation Technology (CAT)
	*
	* Copyright (C) 2018 Intel Corporation
	*
	* Author: Reinette Chatre <reinette.chatre@intel.com>
	*/

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/cacheflush.h>
	#include <linux/cpu.h>
	#include <linux/perf_event.h>
	#include <linux/pm_qos.h>
	#include <linux/resctrl.h>

	#include <asm/cpu_device_id.h>
	#include <asm/perf_event.h>
	#include <asm/msr.h>

	#include "../../events/perf_event.h" /* For X86_CONFIG() */
	#include "internal.h"

	#define CREATE_TRACE_POINTS

	#include "pseudo_lock_trace.h"

	/*
	* The bits needed to disable hardware prefetching vary based on the
	* platform. resctrl_arch_get_prefetch_disable_bits() discovers which bits
	* to use and stores them here; the pseudo-lock and measurement functions
	* in this file write the value to MSR_MISC_FEATURE_CONTROL.
	*/
	static u64 prefetch_disable_bits;

	/**
	* resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
	* platforms
	* @void: It takes no parameters.
	*
	* Capture the list of platforms that have been validated to support
	* pseudo-locking. This includes testing to ensure pseudo-locked regions
	* with low cache miss rates can be created under variety of load conditions
	* as well as that these pseudo-locked regions can maintain their low cache
	* miss rates under variety of load conditions for significant lengths of time.
	*
	* After a platform has been validated to support pseudo-locking its
	* hardware prefetch disable bits are included here as they are documented
	* in the SDM.
	*
	* When adding a platform here also add support for its cache events to
	* resctrl_arch_measure_l*_residency()
	*
	* Return:
	* If platform is supported, the bits to disable hardware prefetchers, 0
	* if platform is not supported.
	*/
	u64 resctrl_arch_get_prefetch_disable_bits(void)
	{
		/* Start from "unsupported"; only validated platforms set bits. */
		prefetch_disable_bits = 0;

		/* Only Intel family 6 parts are considered at all. */
		if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
		    boot_cpu_data.x86 != 6)
			return 0;

		switch (boot_cpu_data.x86_vfm) {
		case INTEL_BROADWELL_X:
			/*
			 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
			 * as:
			 * 0    L2 Hardware Prefetcher Disable (R/W)
			 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
			 * 2    DCU Hardware Prefetcher Disable (R/W)
			 * 3    DCU IP Prefetcher Disable (R/W)
			 * 63:4 Reserved
			 */
			prefetch_disable_bits = 0xF;
			break;
		case INTEL_ATOM_GOLDMONT:
		case INTEL_ATOM_GOLDMONT_PLUS:
			/*
			 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
			 * as:
			 * 0     L2 Hardware Prefetcher Disable (R/W)
			 * 1     Reserved
			 * 2     DCU Hardware Prefetcher Disable (R/W)
			 * 63:3  Reserved
			 */
			prefetch_disable_bits = 0x5;
			break;
		}

		/* Still 0 here if the model matched none of the cases above. */
		return prefetch_disable_bits;
	}

	/**
	* resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
	* @_plr: the pseudo-lock region descriptor
	*
	* This is the core pseudo-locking flow.
	*
	* First we ensure that the kernel memory cannot be found in the cache.
	* Then, while taking care that there will be as little interference as
	* possible, the memory to be loaded is accessed while core is running
	* with class of service set to the bitmask of the pseudo-locked region.
	* After this is complete no future CAT allocations will be allowed to
	* overlap with this bitmask.
	*
	* Local register variables are utilized to ensure that the memory region
	* to be locked is the only memory access made during the critical locking
	* loop.
	*
	* Return: 0. Waiter on waitqueue will be woken on completion.
	*/
	int resctrl_arch_pseudo_lock_fn(void *_plr)
	{
	struct pseudo_lock_region *plr = _plr;
	/* RMID and CLOSID active on this CPU before the switch below */
	u32 rmid_p, closid_p;
	unsigned long i;
	/* MSR_MISC_FEATURE_CONTROL contents to put back once loading is done */
	u64 saved_msr;
	#ifdef CONFIG_KASAN
	/*
	* The registers used for local register variables are also used
	* when KASAN is active. When KASAN is active we use a regular
	* variable to ensure we always use a valid pointer, but the cost
	* is that this variable will enter the cache through evicting the
	* memory we are trying to lock into the cache. Thus expect lower
	* pseudo-locking success rate when KASAN is active.
	*/
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
	#else
	/* Pin the loop operands to fixed registers (see function comment). */
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
	#endif /* CONFIG_KASAN */

	/*
	* Make sure none of the allocated memory is cached. If it is we
	* will get a cache hit in below loop from outside of pseudo-locked
	* region.
	* wbinvd (as opposed to clflush/clflushopt) is required to
	* increase likelihood that allocated cache portion will be filled
	* with associated memory.
	*/
	wbinvd();

	/*
	* Always called with interrupts enabled. By disabling interrupts
	* ensure that we will not be preempted during this critical section.
	*/
	local_irq_disable();

	/*
	* Call wrmsr and rdmsr as directly as possible to avoid tracing
	* clobbering local register variables or affecting cache accesses.
	*
	* Disable the hardware prefetcher so that when the end of the memory
	* being pseudo-locked is reached the hardware will not read beyond
	* the buffer and evict pseudo-locked memory read earlier from the
	* cache.
	*/
	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	/*
	* Snapshot this CPU's current CLOSID/RMID and copy the region
	* parameters into the locals declared above before the CLOSID is
	* switched.
	*/
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	* Critical section begin: start by writing the closid associated
	* with the capacity bitmask of the cache region being
	* pseudo-locked followed by reading of kernel memory to load it
	* into the cache.
	*/
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	* Cache was flushed earlier. Now access kernel memory to read it
	* into cache region associated with just activated plr->closid.
	* Loop over data twice:
	* - In first loop the cache region is shared with the page walker
	* as it populates the paging structure caches (including TLB).
	* - In the second loop the paging structure caches are used and
	* cache region is populated with the memory being referenced.
	*/
	for (i = 0; i < size; i += PAGE_SIZE) {
	/*
	* Add a barrier to prevent speculative execution of this
	* loop reading beyond the end of the buffer.
	*/
	rmb();
	/*
	* One load per page: read from mem_r + i into %eax. The asm has
	* no outputs, so the loaded value itself is not used.
	*/
	asm volatile("mov (%0,%1,1), %%eax\n\t"
	:
	: "r" (mem_r), "r" (i)
	: "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
	/*
	* Add a barrier to prevent speculative execution of this
	* loop reading beyond the end of the buffer.
	*/
	rmb();
	/* Same load as above, now stepping by line_size bytes. */
	asm volatile("mov (%0,%1,1), %%eax\n\t"
	:
	: "r" (mem_r), "r" (i)
	: "%eax", "memory");
	}
	/*
	* Critical section end: restore closid with capacity bitmask that
	* does not overlap with pseudo-locked region.
	*/
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	/* Flag completion, then wake any waiter on lock_thread_wq. */
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
	}

	/**
	* resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
	* pseudo-locked memory
	* @_plr: pseudo-lock region to measure
	*
	* There is no deterministic way to test if a memory region is cached. One
	* way is to measure how long it takes to read the memory, the speed of
	* access is a good way to learn how close to the cpu the data was. Even
	* more, if the prefetcher is disabled and the memory is read at a stride
	* of half the cache line, then a cache miss will be easy to spot since the
	* read of the first half would be significantly slower than the read of
	* the second half.
	*
	* Return: 0. Waiter on waitqueue will be woken on completion.
	*/
	int resctrl_arch_measure_cycles_lat_fn(void *_plr)
	{
	struct pseudo_lock_region *plr = _plr;
	/* Low/high halves of MSR_MISC_FEATURE_CONTROL, restored at the end */
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	/* Interrupts stay off for the whole measurement. */
	local_irq_disable();
	/*
	* Disable hardware prefetchers.
	*/
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	mem_r = READ_ONCE(plr->kmem);
	/*
	* Dummy execute of the time measurement to load the needed
	* instructions into the L1 instruction cache.
	*/
	start = rdtsc_ordered();
	/*
	* Walk the region in 32 byte steps. Each step times a single load
	* from mem_r + i and emits one tracepoint carrying the TSC delta
	* truncated to 32 bits.
	*/
	for (i = 0; i < plr->size; i += 32) {
	start = rdtsc_ordered();
	asm volatile("mov (%0,%1,1), %%eax\n\t"
	:
	: "r" (mem_r), "r" (i)
	: "%eax", "memory");
	end = rdtsc_ordered();
	trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	/* Put back the prefetcher configuration saved above. */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	/* Flag completion, then wake any waiter on lock_thread_wq. */
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
	}

	/*
	* Create a perf_event_attr for the hit and miss perf events that will
	* be used during the performance measurement. A perf_event maintains
	* a pointer to its perf_event_attr so a unique attribute structure is
	* created for each perf_event.
	*
	* The actual configuration of the event is set right before use in order
	* to use the X86_CONFIG macro.
	*/
	/*
	* Attributes of the cache-miss event. .config is left unset here; it is
	* assigned in resctrl_arch_measure_l2_residency() and
	* resctrl_arch_measure_l3_residency() right before the event is created.
	*/
	static struct perf_event_attr perf_miss_attr = {
	.type = PERF_TYPE_RAW,
	.size = sizeof(struct perf_event_attr),
	.pinned = 1,
	.disabled = 0,
	.exclude_user = 1,
	};

	/*
	* Attributes of the cache-hit event, initialised the same way as
	* perf_miss_attr. .config is assigned by the
	* resctrl_arch_measure_l*_residency() functions; on Broadwell it is set
	* to an event that counts references rather than hits.
	*/
	static struct perf_event_attr perf_hit_attr = {
	.type = PERF_TYPE_RAW,
	.size = sizeof(struct perf_event_attr),
	.pinned = 1,
	.disabled = 0,
	.exclude_user = 1,
	};

	/*
	* Raw miss/hit counter values filled in by measure_residency_fn():
	* the *_before values are sampled right before the region is read,
	* the *_after values right after. All four are zero if the
	* measurement could not be set up.
	*/
	struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
	};

	/*
	 * measure_residency_fn - Sample hit and miss counters around a read of a
	 *                        pseudo-locked region
	 * @miss_attr: attributes of the perf event counting misses
	 * @hit_attr:  attributes of the perf event counting hits
	 * @plr:       region to read; ->cpu, ->kmem, ->size and ->line_size are used
	 * @counts:    receives the counter values sampled before and after the
	 *             read loop; all four are zero if event setup or validation
	 *             fails
	 *
	 * Return: always 0.
	 */
	static int measure_residency_fn(struct perf_event_attr *miss_attr,
					struct perf_event_attr *hit_attr,
					struct pseudo_lock_region *plr,
					struct residency_counts *counts)
	{
		u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
		/*
		 * Both are pointers: they hold the result of
		 * perf_event_create_kernel_counter(), which is checked with
		 * IS_ERR() and later handed to perf_event_release_kernel().
		 */
		struct perf_event *miss_event, *hit_event;
		int hit_pmcnum, miss_pmcnum;
		/* Low/high halves of MSR_MISC_FEATURE_CONTROL, restored at the end */
		u32 saved_low, saved_high;
		unsigned int line_size;
		unsigned int size;
		unsigned long i;
		void *mem_r;
		/* Scratch target for the validation reads; the value is not used */
		u64 tmp;

		/* Both events are created on plr->cpu. */
		miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
							      NULL, NULL, NULL);
		if (IS_ERR(miss_event))
			goto out;

		hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
							     NULL, NULL, NULL);
		if (IS_ERR(hit_event))
			goto out_miss;

		local_irq_disable();
		/*
		 * Check any possible error state of events used by performing
		 * one local read.
		 */
		if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
			local_irq_enable();
			goto out_hit;
		}
		if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
			local_irq_enable();
			goto out_hit;
		}

		/*
		 * Disable hardware prefetchers.
		 */
		rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
		wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

		/* Initialize rest of local variables */
		/*
		 * Performance event has been validated right before this with
		 * interrupts disabled - it is thus safe to read the counter index.
		 */
		miss_pmcnum = x86_perf_rdpmc_index(miss_event);
		hit_pmcnum = x86_perf_rdpmc_index(hit_event);
		line_size = READ_ONCE(plr->line_size);
		mem_r = READ_ONCE(plr->kmem);
		size = READ_ONCE(plr->size);

		/*
		 * Read counter variables twice - first to load the instructions
		 * used in L1 cache, second to capture accurate value that does not
		 * include cache misses incurred because of instruction loads.
		 */
		hits_before = rdpmc(hit_pmcnum);
		miss_before = rdpmc(miss_pmcnum);
		/*
		 * From SDM: Performing back-to-back fast reads are not guaranteed
		 * to be monotonic.
		 * Use LFENCE to ensure all previous instructions are retired
		 * before proceeding.
		 */
		rmb();
		hits_before = rdpmc(hit_pmcnum);
		miss_before = rdpmc(miss_pmcnum);
		/*
		 * Use LFENCE to ensure all previous instructions are retired
		 * before proceeding.
		 */
		rmb();
		/* The measured access pattern: one load every line_size bytes. */
		for (i = 0; i < size; i += line_size) {
			/*
			 * Add a barrier to prevent speculative execution of this
			 * loop reading beyond the end of the buffer.
			 */
			rmb();
			asm volatile("mov (%0,%1,1), %%eax\n\t"
				     :
				     : "r" (mem_r), "r" (i)
				     : "%eax", "memory");
		}
		/*
		 * Use LFENCE to ensure all previous instructions are retired
		 * before proceeding.
		 */
		rmb();
		hits_after = rdpmc(hit_pmcnum);
		miss_after = rdpmc(miss_pmcnum);
		/*
		 * Use LFENCE to ensure all previous instructions are retired
		 * before proceeding.
		 */
		rmb();
		/* Re-enable hardware prefetchers */
		wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
		local_irq_enable();
		/* Unwind in reverse order of creation. */
	out_hit:
		perf_event_release_kernel(hit_event);
	out_miss:
		perf_event_release_kernel(miss_event);
	out:
		/*
		 * All counts will be zero on failure.
		 */
		counts->miss_before = miss_before;
		counts->hits_before = hits_before;
		counts->miss_after = miss_after;
		counts->hits_after = hits_after;
		return 0;
	}

	int resctrl_arch_measure_l2_residency(void *_plr)
	{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	* Non-architectural event for the Goldmont Microarchitecture
	* from Intel x86 Architecture Software Developer Manual (SDM):
	* MEM_LOAD_UOPS_RETIRED D1H (event number)
	* Umask values:
	* L2_HIT 02H
	* L2_MISS 10H
	*/
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
	perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
	.umask = 0x10);
	perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
	.umask = 0x2);
	break;
	default:
	goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	* If a failure prevented the measurements from succeeding
	* tracepoints will still be written and all counts will be zero.
	*/
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
	counts.miss_after - counts.miss_before);
	out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
	}

	int resctrl_arch_measure_l3_residency(void *_plr)
	{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	* On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	* has two "no fix" errata associated with it: BDM35 and BDM100. On
	* this platform the following events are used instead:
	* LONGEST_LAT_CACHE 2EH (Documented in SDM)
	* REFERENCE 4FH
	* MISS 41H
	*/

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
	/* On BDW the hit event counts references, not hits */
	perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
	.umask = 0x4f);
	perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
	.umask = 0x41);
	break;
	default:
	goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	* If a failure prevented the measurements from succeeding
	* tracepoints will still be written and all counts will be zero.
	*/

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
	/*
	* On BDW references and misses are counted, need to adjust.
	* Sometimes the "hits" counter is a bit more than the
	* references, for example, x references but x + 1 hits.
	* To not report invalid hit values in this case we treat
	* that as misses equal to references.
	*/
	/* First compute the number of cache references measured */
	counts.hits_after -= counts.hits_before;
	/* Next convert references to cache hits */
	counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
	counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
	out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
	}