Blame SOURCES/oprofile-knl.patch

b527cc
commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089
b527cc
Author: Michael Petlan <mpetlan@redhat.com>
b527cc
Date:   Tue May 16 23:21:47 2017 +0200
b527cc
b527cc
    oprofile: Add support for Intel Xeon Phi (Knights Landing)
b527cc
    
b527cc
    Adds support for Intel Xeon Phi (Knights Landing and Knights Mill)
b527cc
    processors to oprofile. Only core events are supported.
b527cc
    
b527cc
    The events/umasks configuration has been taken from Intel Xeon Phi
b527cc
    Processor Performance Monitoring Reference Manual, volume 2. All
b527cc
    the events were tested on a Knights Mill machine.
b527cc
    
b527cc
    Signed-off-by: Michael Petlan <mpetlan@redhat.com>
b527cc
b527cc
diff --git a/events/Makefile.am b/events/Makefile.am
b527cc
index b8f06af..13d063a 100644
b527cc
--- a/events/Makefile.am
b527cc
+++ b/events/Makefile.am
b527cc
@@ -21,6 +21,7 @@ event_files = \
b527cc
 	i386/skylake/events i386/skylake/unit_masks \
b527cc
 	i386/silvermont/events i386/silvermont/unit_masks \
b527cc
 	i386/goldmont/events i386/goldmont/unit_masks \
b527cc
+	i386/knightslanding/events i386/knightslanding/unit_masks \
b527cc
 	ia64/ia64/events ia64/ia64/unit_masks \
b527cc
 	ia64/itanium2/events ia64/itanium2/unit_masks \
b527cc
 	ia64/itanium/events ia64/itanium/unit_masks \
b527cc
diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events
b527cc
new file mode 100644
b527cc
index 0000000..d34feca
b527cc
--- /dev/null
b527cc
+++ b/events/i386/knightslanding/events
b527cc
@@ -0,0 +1,26 @@
b527cc
+#
b527cc
+# Intel "Knights Landing" microarchitecture core events.
b527cc
+#
b527cc
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
b527cc
+#
b527cc
+# Note the minimum counts are not discovered experimentally and could likely be
b527cc
+# lowered in many cases without ill effect.
b527cc
+#
b527cc
+include:i386/arch_perfmon
b527cc
+event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue
b527cc
+event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired.
b527cc
+event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks
b527cc
+event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected.
b527cc
+event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition.
b527cc
+event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches
b527cc
+event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled
b527cc
+event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests
b527cc
+event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops
b527cc
+event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire.
b527cc
+event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired
b527cc
+event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired
b527cc
+event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated
b527cc
+event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full.
b527cc
+event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy
b527cc
+event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor
b527cc
+event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints
b527cc
diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks
b527cc
new file mode 100644
b527cc
index 0000000..b0e7910
b527cc
--- /dev/null
b527cc
+++ b/events/i386/knightslanding/unit_masks
b527cc
@@ -0,0 +1,91 @@
b527cc
+#
b527cc
+# Unit masks for the Intel "Knights Landing" micro architecture
b527cc
+#
b527cc
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
b527cc
+#
b527cc
+include:i386/arch_perfmon
b527cc
+name:recycleq type:exclusive default:any_ld
b527cc
+	0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store.
b527cc
+	0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready.
b527cc
+	0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once.
b527cc
+	0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once.
b527cc
+	0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores.
b527cc
+	0x20 extra: sta_full Counts the store micro-ops retired that were pushed in the rehab queue because the store address buffer is full.
b527cc
+	0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason.
b527cc
+	0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason.
b527cc
+name:mem_uops_retired type:exclusive default:any_loads
b527cc
+	0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache.
b527cc
+	0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2.
b527cc
+	0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2.
b527cc
+	0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss.
b527cc
+	0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss.
b527cc
+	0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state.
b527cc
+	0x40 extra: any_loads Counts all the load micro-ops retired.
b527cc
+	0x80 extra: any_stores Counts all the store micro-ops retired.
b527cc
+name:page_walks type:exclusive default:walks
b527cc
+	0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted.
b527cc
+	0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included.
b527cc
+	0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed.
b527cc
+	0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included.
b527cc
+	0x03 extra:edge walks Counts the total page walks completed (I-side and D-side)
b527cc
+	0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included.
b527cc
+name:l2_requests_reject type:mandatory default:all
b527cc
+	0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times.
b527cc
+name:core_reject_l2q type:mandatory default:all
b527cc
+	0x00 extra: all  Counts the number of MEC requests that were not accepted into the L2Q because of any L2  queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path
b527cc
+name:icache type:exclusive default:accesses
b527cc
+	0x3 extra: accesses All instruction fetches including uncacheable
b527cc
+	0x1 extra: hits All instruction fetches that hit instruction cache
b527cc
+	0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle
b527cc
+name:fetch_stall type:exclusive default:icache_fill_pending_cycles
b527cc
+	0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses
b527cc
+	0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss.
b527cc
+name:l2_requests type:exclusive default:reference
b527cc
+	0x41 extra: miss Counts the total number of L2 cache misses.
b527cc
+	0x4f extra: reference Counts the total number of L2 cache references.
b527cc
+name:uops_retired type:exclusive default:all
b527cc
+	0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer
b527cc
+	0x10 extra: all Counts the number of uops retired
b527cc
+	0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt.
b527cc
+	0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies.
b527cc
+name:machine_clears type:exclusive default:all
b527cc
+	0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page.
b527cc
+	0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards.
b527cc
+	0x04 extra: fp_assist Counts the number of floating operations retired that required microcode assists
b527cc
+	0x08 extra: all Counts all machine clears
b527cc
+name:br_inst_retired type:exclusive default:any
b527cc
+	0x00 extra:pebs any Counts the number of branch instructions retired
b527cc
+	0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps.
b527cc
+	0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken.
b527cc
+	0xf9 extra:pebs call Counts the number of near CALL branch instructions retired.
b527cc
+	0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired.
b527cc
+	0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired.
b527cc
+	0xf7 extra:pebs return Counts the number of near RET branch instructions retired.
b527cc
+	0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP.
b527cc
+	0xbf extra:pebs far_branch Counts the number of far branch instructions retired.
b527cc
+name:br_misp_retired type:exclusive default:any
b527cc
+	0x00 extra:pebs any All mispredicted branches
b527cc
+	0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired
b527cc
+	0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired
b527cc
+	0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired.
b527cc
+	0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired.
b527cc
+	0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired
b527cc
+	0xf7 extra:pebs return Number of mispredicted return branch instructions retired
b527cc
+	0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired
b527cc
+	0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired.
b527cc
+name:no_alloc_cycles type:exclusive default:all
b527cc
+	0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full
b527cc
+	0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire.
b527cc
+	0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted.
b527cc
+	0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason.
b527cc
+name:rs_full_stall type:exclusive default:all
b527cc
+	0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry.
b527cc
+	0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full.
b527cc
+name:cycles_div_busy type:mandatory default:all
b527cc
+	0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider
b527cc
+name:baclears type:exclusive default:all
b527cc
+	0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end.
b527cc
+	0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end.
b527cc
+	0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end.
b527cc
+name:ms_decoded type:mandatory default:ms_entry
b527cc
+	0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops.
b527cc
diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
b527cc
index 7acecda..610121e 100644
b527cc
--- a/libop/op_cpu_type.c
b527cc
+++ b/libop/op_cpu_type.c
b527cc
@@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
b527cc
 	{ "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 },
b527cc
 	{ "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 },
b527cc
 	{ "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 },
b527cc
+	{ "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 },
b527cc
 };
b527cc
  
b527cc
 static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
b527cc
@@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type)
b527cc
 	case CPU_WESTMERE:
b527cc
 	case CPU_SANDYBRIDGE:
b527cc
 	case CPU_IVYBRIDGE:
b527cc
+	case CPU_KNIGHTSLANDING:
b527cc
 		return CPU_ARCH_PERFMON;
b527cc
 	default:
b527cc
 		/* assume processor in a class by itself */
b527cc
diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h
b527cc
index 39b7726..e2f8f60 100644
b527cc
--- a/libop/op_cpu_type.h
b527cc
+++ b/libop/op_cpu_type.h
b527cc
@@ -106,6 +106,7 @@ typedef enum {
b527cc
 	CPU_SKYLAKE, /** < Intel Skylake microarchitecture */
b527cc
 	CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */
b527cc
 	CPU_PPC64_POWER9, /**< ppc64 POWER8 family */
b527cc
+	CPU_KNIGHTSLANDING, /** Intel Knights Landing microarchitecture */
b527cc
 	MAX_CPU_TYPE
b527cc
 } op_cpu;
b527cc
 
b527cc
diff --git a/libop/op_events.c b/libop/op_events.c
b527cc
index 0ba57e0..acadaa7 100644
b527cc
--- a/libop/op_events.c
b527cc
+++ b/libop/op_events.c
b527cc
@@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
b527cc
 		case CPU_WESTMERE:
b527cc
 		case CPU_SANDYBRIDGE:
b527cc
 		case CPU_IVYBRIDGE:
b527cc
+		case CPU_KNIGHTSLANDING:
b527cc
 		case CPU_MIPS_LOONGSON2:
b527cc
 		case CPU_FAMILY12H:
b527cc
 		case CPU_FAMILY14H:
b527cc
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
b527cc
index 2061760..840e7b3 100644
b527cc
--- a/libop/op_hw_specific.h
b527cc
+++ b/libop/op_hw_specific.h
b527cc
@@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
b527cc
 		case 0x5c:
b527cc
 		case 0x5f:
b527cc
 			return CPU_GOLDMONT;
b527cc
+		case 0x57:
b527cc
+		case 0x85:
b527cc
+			return CPU_KNIGHTSLANDING;
b527cc
 		}
b527cc
 	}
b527cc
 	return cpu_type;
b527cc
diff --git a/utils/ophelp.c b/utils/ophelp.c
b527cc
index 6eb299c..f76bf2a 100644
b527cc
--- a/utils/ophelp.c
b527cc
+++ b/utils/ophelp.c
b527cc
@@ -554,6 +554,12 @@ int main(int argc, char const * argv[])
b527cc
 			"Intel Architecture Optimization Reference Manual\n\n";
b527cc
 		break;
b527cc
 
b527cc
+	case CPU_KNIGHTSLANDING:
b527cc
+		event_doc =
b527cc
+			"See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n"
b527cc
+			"Intel Architecture Optimization Reference Manual\n\n";
b527cc
+		break;
b527cc
+
b527cc
 	case CPU_ARCH_PERFMON:
b527cc
 		event_doc =
b527cc
 			"See Intel 64 and IA-32 Architectures Software Developer's Manual\n"