Blob Blame History Raw
commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089
Author: Michael Petlan <mpetlan@redhat.com>
Date:   Tue May 16 23:21:47 2017 +0200

    oprofile: Add support for Intel Xeon Phi (Knights Landing)
    
    Adds support for Intel Xeon Phi (Knights Landing and Knights Mill)
    processors to oprofile. Only core events are supported.
    
    The events/umasks configuration has been taken from Intel Xeon Phi
    Processor Performance Monitoring Reference Manual, volume 2. All
    the events were tested on a Knights Mill machine.
    
    Signed-off-by: Michael Petlan <mpetlan@redhat.com>

diff --git a/events/Makefile.am b/events/Makefile.am
index b8f06af..13d063a 100644
--- a/events/Makefile.am
+++ b/events/Makefile.am
@@ -21,6 +21,7 @@ event_files = \
 	i386/skylake/events i386/skylake/unit_masks \
 	i386/silvermont/events i386/silvermont/unit_masks \
 	i386/goldmont/events i386/goldmont/unit_masks \
+	i386/knightslanding/events i386/knightslanding/unit_masks \
 	ia64/ia64/events ia64/ia64/unit_masks \
 	ia64/itanium2/events ia64/itanium2/unit_masks \
 	ia64/itanium/events ia64/itanium/unit_masks \
diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events
new file mode 100644
index 0000000..d34feca
--- /dev/null
+++ b/events/i386/knightslanding/events
@@ -0,0 +1,26 @@
+#
+# Intel "Knights Landing" microarchitecture core events.
+#
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
+#
+# Note the minimum counts were not determined experimentally and could likely be
+# lowered in many cases without ill effect.
+#
+include:i386/arch_perfmon
+event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue
+event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired.
+event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks
+event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected.
+event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition.
+event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches
+event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled
+event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests
+event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops
+event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire.
+event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired
+event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired
+event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated
+event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full.
+event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy
+event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor
+event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints
diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks
new file mode 100644
index 0000000..b0e7910
--- /dev/null
+++ b/events/i386/knightslanding/unit_masks
@@ -0,0 +1,91 @@
+#
+# Unit masks for the Intel "Knights Landing" micro architecture
+#
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
+#
+include:i386/arch_perfmon
+name:recycleq type:exclusive default:any_ld
+	0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store.
+	0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready.
+	0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once.
+	0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once.
+	0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores.
+	0x20 extra: sta_full Counts the store micro-ops retired that were pushed in the rehab queue because the store address buffer is full.
+	0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason.
+	0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason.
+name:mem_uops_retired type:exclusive default:any_loads
+	0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache.
+	0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2.
+	0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2.
+	0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss.
+	0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss.
+	0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state.
+	0x40 extra: any_loads Counts all the load micro-ops retired.
+	0x80 extra: any_stores Counts all the store micro-ops retired.
+name:page_walks type:exclusive default:walks
+	0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted.
+	0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included.
+	0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed.
+	0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included.
+	0x03 extra:edge walks Counts the total page walks completed (I-side and D-side)
+	0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included.
+name:l2_requests_reject type:mandatory default:all
+	0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times.
+name:core_reject_l2q type:mandatory default:all
+	0x00 extra: all  Counts the number of MEC requests that were not accepted into the L2Q because of any L2  queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path
+name:icache type:exclusive default:accesses
+	0x3 extra: accesses All instruction fetches including uncacheable
+	0x1 extra: hits All instruction fetches that hit instruction cache
+	0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle
+name:fetch_stall type:exclusive default:icache_fill_pending_cycles
+	0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses
+	0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss.
+name:l2_requests type:exclusive default:reference
+	0x41 extra: miss Counts the total number of L2 cache misses.
+	0x4f extra: reference Counts the total number of L2 cache references.
+name:uops_retired type:exclusive default:all
+	0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer
+	0x10 extra: all Counts the number of uops retired
+	0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt.
+	0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies.
+name:machine_clears type:exclusive default:all
+	0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page.
+	0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards.
+	0x04 extra: fp_assist Counts the number of floating operations retired that required microcode assists
+	0x08 extra: all Counts all machine clears
+name:br_inst_retired type:exclusive default:any
+	0x00 extra:pebs any Counts the number of branch instructions retired
+	0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps.
+	0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken.
+	0xf9 extra:pebs call Counts the number of near CALL branch instructions retired.
+	0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired.
+	0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired.
+	0xf7 extra:pebs return Counts the number of near RET branch instructions retired.
+	0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP.
+	0xbf extra:pebs far_branch Counts the number of far branch instructions retired.
+name:br_misp_retired type:exclusive default:any
+	0x00 extra:pebs any All mispredicted branches
+	0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired
+	0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired
+	0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired.
+	0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired.
+	0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired
+	0xf7 extra:pebs return Number of mispredicted return branch instructions retired
+	0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired
+	0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired.
+name:no_alloc_cycles type:exclusive default:all
+	0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full
+	0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire.
+	0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted.
+	0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason.
+name:rs_full_stall type:exclusive default:all
+	0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry.
+	0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full.
+name:cycles_div_busy type:mandatory default:all
+	0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider
+name:baclears type:exclusive default:all
+	0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end.
+	0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end.
+	0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end.
+name:ms_decoded type:mandatory default:ms_entry
+	0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops.
diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
index 7acecda..610121e 100644
--- a/libop/op_cpu_type.c
+++ b/libop/op_cpu_type.c
@@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
 	{ "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 },
 	{ "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 },
 	{ "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 },
+	{ "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 },
 };
  
 static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
@@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type)
 	case CPU_WESTMERE:
 	case CPU_SANDYBRIDGE:
 	case CPU_IVYBRIDGE:
+	case CPU_KNIGHTSLANDING:
 		return CPU_ARCH_PERFMON;
 	default:
 		/* assume processor in a class by itself */
diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h
index 39b7726..e2f8f60 100644
--- a/libop/op_cpu_type.h
+++ b/libop/op_cpu_type.h
@@ -106,6 +106,7 @@ typedef enum {
 	CPU_SKYLAKE, /** < Intel Skylake microarchitecture */
 	CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */
 	CPU_PPC64_POWER9, /**< ppc64 POWER8 family */
+	CPU_KNIGHTSLANDING, /** Intel Knights Landing microarchitecture */
 	MAX_CPU_TYPE
 } op_cpu;
 
diff --git a/libop/op_events.c b/libop/op_events.c
index 0ba57e0..acadaa7 100644
--- a/libop/op_events.c
+++ b/libop/op_events.c
@@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
 		case CPU_WESTMERE:
 		case CPU_SANDYBRIDGE:
 		case CPU_IVYBRIDGE:
+		case CPU_KNIGHTSLANDING:
 		case CPU_MIPS_LOONGSON2:
 		case CPU_FAMILY12H:
 		case CPU_FAMILY14H:
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
index 2061760..840e7b3 100644
--- a/libop/op_hw_specific.h
+++ b/libop/op_hw_specific.h
@@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
 		case 0x5c:
 		case 0x5f:
 			return CPU_GOLDMONT;
+		case 0x57:
+		case 0x85:
+			return CPU_KNIGHTSLANDING;
 		}
 	}
 	return cpu_type;
diff --git a/utils/ophelp.c b/utils/ophelp.c
index 6eb299c..f76bf2a 100644
--- a/utils/ophelp.c
+++ b/utils/ophelp.c
@@ -554,6 +554,12 @@ int main(int argc, char const * argv[])
 			"Intel Architecture Optimization Reference Manual\n\n";
 		break;
 
+	case CPU_KNIGHTSLANDING:
+		event_doc =
+			"See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n"
+			"Intel Architecture Optimization Reference Manual\n\n";
+		break;
+
 	case CPU_ARCH_PERFMON:
 		event_doc =
 			"See Intel 64 and IA-32 Architectures Software Developer's Manual\n"