Blame SOURCES/oprofile-knl.patch

dcd44d
commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089
dcd44d
Author: Michael Petlan <mpetlan@redhat.com>
dcd44d
Date:   Tue May 16 23:21:47 2017 +0200
dcd44d
dcd44d
    oprofile: Add support for Intel Xeon Phi (Knights Landing)
dcd44d
    
dcd44d
    Adds support for Intel Xeon Phi (Knights Landing and Knights Mill)
dcd44d
    processors to oprofile. Only core events are supported.
dcd44d
    
dcd44d
    The events/umasks configuration has been taken from Intel Xeon Phi
dcd44d
    Processor Performance Monitoring Reference Manual, volume 2. All
dcd44d
    the events were tested on a Knights Mill machine.
dcd44d
    
dcd44d
    Signed-off-by: Michael Petlan <mpetlan@redhat.com>
dcd44d
dcd44d
diff --git a/events/Makefile.am b/events/Makefile.am
dcd44d
index b8f06af..13d063a 100644
dcd44d
--- a/events/Makefile.am
dcd44d
+++ b/events/Makefile.am
dcd44d
@@ -21,6 +21,7 @@ event_files = \
dcd44d
 	i386/skylake/events i386/skylake/unit_masks \
dcd44d
 	i386/silvermont/events i386/silvermont/unit_masks \
dcd44d
 	i386/goldmont/events i386/goldmont/unit_masks \
dcd44d
+	i386/knightslanding/events i386/knightslanding/unit_masks \
dcd44d
 	ia64/ia64/events ia64/ia64/unit_masks \
dcd44d
 	ia64/itanium2/events ia64/itanium2/unit_masks \
dcd44d
 	ia64/itanium/events ia64/itanium/unit_masks \
dcd44d
diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events
dcd44d
new file mode 100644
dcd44d
index 0000000..d34feca
dcd44d
--- /dev/null
dcd44d
+++ b/events/i386/knightslanding/events
dcd44d
@@ -0,0 +1,26 @@
dcd44d
+#
dcd44d
+# Intel "Knights Landing" microarchitecture core events.
dcd44d
+#
dcd44d
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
dcd44d
+#
dcd44d
+# Note the minimum counts are not discovered experimentally and could likely be
dcd44d
+# lowered in many cases without ill effect.
dcd44d
+#
dcd44d
+include:i386/arch_perfmon
dcd44d
+event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue
dcd44d
+event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired.
dcd44d
+event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks
dcd44d
+event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected.
dcd44d
+event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition.
dcd44d
+event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches
dcd44d
+event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled
dcd44d
+event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests
dcd44d
+event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops
dcd44d
+event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire.
dcd44d
+event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired
dcd44d
+event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired
dcd44d
+event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated
dcd44d
+event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full.
dcd44d
+event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy
dcd44d
+event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor
dcd44d
+event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints
dcd44d
diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks
dcd44d
new file mode 100644
dcd44d
index 0000000..b0e7910
dcd44d
--- /dev/null
dcd44d
+++ b/events/i386/knightslanding/unit_masks
dcd44d
@@ -0,0 +1,91 @@
dcd44d
+#
dcd44d
+# Unit masks for the Intel "Knights Landing" microarchitecture
dcd44d
+#
dcd44d
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
dcd44d
+#
dcd44d
+include:i386/arch_perfmon
dcd44d
+name:recycleq type:exclusive default:any_ld
dcd44d
+	0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store.
dcd44d
+	0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready.
dcd44d
+	0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once.
dcd44d
+	0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once.
dcd44d
+	0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores.
dcd44d
+	0x20 extra: sta_full Counts the store micro-ops retired that were pushed into the recycle queue because the store address buffer is full.
dcd44d
+	0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason.
dcd44d
+	0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason.
dcd44d
+name:mem_uops_retired type:exclusive default:any_loads
dcd44d
+	0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache.
dcd44d
+	0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2.
dcd44d
+	0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2.
dcd44d
+	0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss.
dcd44d
+	0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss.
dcd44d
+	0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state.
dcd44d
+	0x40 extra: any_loads Counts all the load micro-ops retired.
dcd44d
+	0x80 extra: any_stores Counts all the store micro-ops retired.
dcd44d
+name:page_walks type:exclusive default:walks
dcd44d
+	0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted.
dcd44d
+	0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included.
dcd44d
+	0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed.
dcd44d
+	0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included.
dcd44d
+	0x03 extra:edge walks Counts the total page walks completed (I-side and D-side)
dcd44d
+	0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included.
dcd44d
+name:l2_requests_reject type:mandatory default:all
dcd44d
+	0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times.
dcd44d
+name:core_reject_l2q type:mandatory default:all
dcd44d
+	0x00 extra: all  Counts the number of MEC requests that were not accepted into the L2Q because of any L2 queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path
dcd44d
+name:icache type:exclusive default:accesses
dcd44d
+	0x3 extra: accesses All instruction fetches including uncacheable
dcd44d
+	0x1 extra: hits All instruction fetches that hit instruction cache
dcd44d
+	0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle
dcd44d
+name:fetch_stall type:exclusive default:icache_fill_pending_cycles
dcd44d
+	0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses
dcd44d
+	0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss.
dcd44d
+name:l2_requests type:exclusive default:reference
dcd44d
+	0x41 extra: miss Counts the total number of L2 cache misses.
dcd44d
+	0x4f extra: reference Counts the total number of L2 cache references.
dcd44d
+name:uops_retired type:exclusive default:all
dcd44d
+	0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer
dcd44d
+	0x10 extra: all Counts the number of uops retired
dcd44d
+	0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt.
dcd44d
+	0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies.
dcd44d
+name:machine_clears type:exclusive default:all
dcd44d
+	0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page.
dcd44d
+	0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards.
dcd44d
+	0x04 extra: fp_assist Counts the number of floating-point operations retired that required microcode assists
dcd44d
+	0x08 extra: all Counts all machine clears
dcd44d
+name:br_inst_retired type:exclusive default:any
dcd44d
+	0x00 extra:pebs any Counts the number of branch instructions retired
dcd44d
+	0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps.
dcd44d
+	0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken.
dcd44d
+	0xf9 extra:pebs call Counts the number of near CALL branch instructions retired.
dcd44d
+	0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired.
dcd44d
+	0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired.
dcd44d
+	0xf7 extra:pebs return Counts the number of near RET branch instructions retired.
dcd44d
+	0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP.
dcd44d
+	0xbf extra:pebs far_branch Counts the number of far branch instructions retired.
dcd44d
+name:br_misp_retired type:exclusive default:any
dcd44d
+	0x00 extra:pebs any All mispredicted branches
dcd44d
+	0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired
dcd44d
+	0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired
dcd44d
+	0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired.
dcd44d
+	0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired.
dcd44d
+	0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired
dcd44d
+	0xf7 extra:pebs return Number of mispredicted return branch instructions retired
dcd44d
+	0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired
dcd44d
+	0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired.
dcd44d
+name:no_alloc_cycles type:exclusive default:all
dcd44d
+	0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full
dcd44d
+	0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire.
dcd44d
+	0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted.
dcd44d
+	0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason.
dcd44d
+name:rs_full_stall type:exclusive default:all
dcd44d
+	0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry.
dcd44d
+	0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full.
dcd44d
+name:cycles_div_busy type:mandatory default:all
dcd44d
+	0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider
dcd44d
+name:baclears type:exclusive default:all
dcd44d
+	0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end.
dcd44d
+	0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end.
dcd44d
+	0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end.
dcd44d
+name:ms_decoded type:mandatory default:ms_entry
dcd44d
+	0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops.
dcd44d
diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
dcd44d
index 7acecda..610121e 100644
dcd44d
--- a/libop/op_cpu_type.c
dcd44d
+++ b/libop/op_cpu_type.c
dcd44d
@@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
dcd44d
 	{ "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 },
dcd44d
 	{ "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 },
dcd44d
 	{ "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 },
dcd44d
+	{ "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 },
dcd44d
 };
dcd44d
  
dcd44d
 static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
dcd44d
@@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type)
dcd44d
 	case CPU_WESTMERE:
dcd44d
 	case CPU_SANDYBRIDGE:
dcd44d
 	case CPU_IVYBRIDGE:
dcd44d
+	case CPU_KNIGHTSLANDING:
dcd44d
 		return CPU_ARCH_PERFMON;
dcd44d
 	default:
dcd44d
 		/* assume processor in a class by itself */
dcd44d
diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h
dcd44d
index 39b7726..e2f8f60 100644
dcd44d
--- a/libop/op_cpu_type.h
dcd44d
+++ b/libop/op_cpu_type.h
dcd44d
@@ -106,6 +106,7 @@ typedef enum {
dcd44d
 	CPU_SKYLAKE, /** < Intel Skylake microarchitecture */
dcd44d
 	CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */
dcd44d
 	CPU_PPC64_POWER9, /**< ppc64 POWER8 family */
dcd44d
+	CPU_KNIGHTSLANDING, /**< Intel Knights Landing microarchitecture */
dcd44d
 	MAX_CPU_TYPE
dcd44d
 } op_cpu;
dcd44d
 
dcd44d
diff --git a/libop/op_events.c b/libop/op_events.c
dcd44d
index 0ba57e0..acadaa7 100644
dcd44d
--- a/libop/op_events.c
dcd44d
+++ b/libop/op_events.c
dcd44d
@@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
dcd44d
 		case CPU_WESTMERE:
dcd44d
 		case CPU_SANDYBRIDGE:
dcd44d
 		case CPU_IVYBRIDGE:
dcd44d
+		case CPU_KNIGHTSLANDING:
dcd44d
 		case CPU_MIPS_LOONGSON2:
dcd44d
 		case CPU_FAMILY12H:
dcd44d
 		case CPU_FAMILY14H:
dcd44d
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
dcd44d
index 2061760..840e7b3 100644
dcd44d
--- a/libop/op_hw_specific.h
dcd44d
+++ b/libop/op_hw_specific.h
dcd44d
@@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
dcd44d
 		case 0x5c:
dcd44d
 		case 0x5f:
dcd44d
 			return CPU_GOLDMONT;
dcd44d
+		case 0x57:
dcd44d
+		case 0x85:
dcd44d
+			return CPU_KNIGHTSLANDING;
dcd44d
 		}
dcd44d
 	}
dcd44d
 	return cpu_type;
dcd44d
diff --git a/utils/ophelp.c b/utils/ophelp.c
dcd44d
index 6eb299c..f76bf2a 100644
dcd44d
--- a/utils/ophelp.c
dcd44d
+++ b/utils/ophelp.c
dcd44d
@@ -554,6 +554,12 @@ int main(int argc, char const * argv[])
dcd44d
 			"Intel Architecture Optimization Reference Manual\n\n";
dcd44d
 		break;
dcd44d
 
dcd44d
+	case CPU_KNIGHTSLANDING:
dcd44d
+		event_doc =
dcd44d
+			"See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n"
dcd44d
+			"Intel Architecture Optimization Reference Manual\n\n";
dcd44d
+		break;
dcd44d
+
dcd44d
 	case CPU_ARCH_PERFMON:
dcd44d
 		event_doc =
dcd44d
 			"See Intel 64 and IA-32 Architectures Software Developer's Manual\n"