|
|
dcd44d |
commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089
|
|
|
dcd44d |
Author: Michael Petlan <mpetlan@redhat.com>
|
|
|
dcd44d |
Date: Tue May 16 23:21:47 2017 +0200
|
|
|
dcd44d |
|
|
|
dcd44d |
oprofile: Add support for Intel Xeon Phi (Knights Landing)
|
|
|
dcd44d |
|
|
|
dcd44d |
Adds support for Intel Xeon Phi (Knights Landing and Knights Mill)
|
|
|
dcd44d |
processors to oprofile. Only core events are supported.
|
|
|
dcd44d |
|
|
|
dcd44d |
The events/umasks configuration has been taken from Intel Xeon Phi
|
|
|
dcd44d |
Processor Performance Monitoring Reference Manual, volume 2. All
|
|
|
dcd44d |
the events were tested on a Knights Mill machine.
|
|
|
dcd44d |
|
|
|
dcd44d |
Signed-off-by: Michael Petlan <mpetlan@redhat.com>
|
|
|
dcd44d |
|
|
|
dcd44d |
diff --git a/events/Makefile.am b/events/Makefile.am
|
|
|
dcd44d |
index b8f06af..13d063a 100644
|
|
|
dcd44d |
--- a/events/Makefile.am
|
|
|
dcd44d |
+++ b/events/Makefile.am
|
|
|
dcd44d |
@@ -21,6 +21,7 @@ event_files = \
|
|
|
dcd44d |
i386/skylake/events i386/skylake/unit_masks \
|
|
|
dcd44d |
i386/silvermont/events i386/silvermont/unit_masks \
|
|
|
dcd44d |
i386/goldmont/events i386/goldmont/unit_masks \
|
|
|
dcd44d |
+ i386/knightslanding/events i386/knightslanding/unit_masks \
|
|
|
dcd44d |
ia64/ia64/events ia64/ia64/unit_masks \
|
|
|
dcd44d |
ia64/itanium2/events ia64/itanium2/unit_masks \
|
|
|
dcd44d |
ia64/itanium/events ia64/itanium/unit_masks \
|
|
|
dcd44d |
diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events
|
|
|
dcd44d |
new file mode 100644
|
|
|
dcd44d |
index 0000000..d34feca
|
|
|
dcd44d |
--- /dev/null
|
|
|
dcd44d |
+++ b/events/i386/knightslanding/events
|
|
|
dcd44d |
@@ -0,0 +1,26 @@
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+# Intel "Knights Landing" microarchitecture core events.
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+# Note the minimum counts are not discovered experimentally and could likely be
|
|
|
dcd44d |
+# lowered in many cases without ill effect.
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+include:i386/arch_perfmon
|
|
|
dcd44d |
+event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue
|
|
|
dcd44d |
+event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired.
|
|
|
dcd44d |
+event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks
|
|
|
dcd44d |
+event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected.
|
|
|
dcd44d |
+event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition.
|
|
|
dcd44d |
+event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches
|
|
|
dcd44d |
+event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled
|
|
|
dcd44d |
+event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests
|
|
|
dcd44d |
+event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops
|
|
|
dcd44d |
+event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire.
|
|
|
dcd44d |
+event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired
|
|
|
dcd44d |
+event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired
|
|
|
dcd44d |
+event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated
|
|
|
dcd44d |
+event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full.
|
|
|
dcd44d |
+event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy
|
|
|
dcd44d |
+event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor
|
|
|
dcd44d |
+event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints
|
|
|
dcd44d |
diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks
|
|
|
dcd44d |
new file mode 100644
|
|
|
dcd44d |
index 0000000..b0e7910
|
|
|
dcd44d |
--- /dev/null
|
|
|
dcd44d |
+++ b/events/i386/knightslanding/unit_masks
|
|
|
dcd44d |
@@ -0,0 +1,91 @@
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+# Unit masks for the Intel "Knights Landing" micro architecture
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+# See http://ark.intel.com/ for help in identifying Knights Landing CPUs
|
|
|
dcd44d |
+#
|
|
|
dcd44d |
+include:i386/arch_perfmon
|
|
|
dcd44d |
+name:recycleq type:exclusive default:any_ld
|
|
|
dcd44d |
+ 0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store.
|
|
|
dcd44d |
+ 0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready.
|
|
|
dcd44d |
+ 0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once.
|
|
|
dcd44d |
+ 0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once.
|
|
|
dcd44d |
+ 0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores.
|
|
|
dcd44d |
+ 0x20 extra: sta_full Counts the store micro-ops retired that were pushed in the recycle queue because the store address buffer is full.
|
|
|
dcd44d |
+ 0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason.
|
|
|
dcd44d |
+ 0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason.
|
|
|
dcd44d |
+name:mem_uops_retired type:exclusive default:any_loads
|
|
|
dcd44d |
+ 0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache.
|
|
|
dcd44d |
+ 0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2.
|
|
|
dcd44d |
+ 0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2.
|
|
|
dcd44d |
+ 0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss.
|
|
|
dcd44d |
+ 0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss.
|
|
|
dcd44d |
+ 0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state.
|
|
|
dcd44d |
+ 0x40 extra: any_loads Counts all the load micro-ops retired.
|
|
|
dcd44d |
+ 0x80 extra: any_stores Counts all the store micro-ops retired.
|
|
|
dcd44d |
+name:page_walks type:exclusive default:walks
|
|
|
dcd44d |
+ 0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted.
|
|
|
dcd44d |
+ 0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included.
|
|
|
dcd44d |
+ 0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed.
|
|
|
dcd44d |
+ 0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included.
|
|
|
dcd44d |
+ 0x03 extra:edge walks Counts the total page walks completed (I-side and D-side)
|
|
|
dcd44d |
+ 0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included.
|
|
|
dcd44d |
+name:l2_requests_reject type:mandatory default:all
|
|
|
dcd44d |
+ 0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times.
|
|
|
dcd44d |
+name:core_reject_l2q type:mandatory default:all
|
|
|
dcd44d |
+ 0x00 extra: all Counts the number of MEC requests that were not accepted into the L2Q because of any L2 queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path
|
|
|
dcd44d |
+name:icache type:exclusive default:accesses
|
|
|
dcd44d |
+ 0x3 extra: accesses All instruction fetches including uncacheable
|
|
|
dcd44d |
+ 0x1 extra: hits All instruction fetches that hit instruction cache
|
|
|
dcd44d |
+ 0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle
|
|
|
dcd44d |
+name:fetch_stall type:exclusive default:icache_fill_pending_cycles
|
|
|
dcd44d |
+ 0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses
|
|
|
dcd44d |
+ 0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss.
|
|
|
dcd44d |
+name:l2_requests type:exclusive default:reference
|
|
|
dcd44d |
+ 0x41 extra: miss Counts the total number of L2 cache misses.
|
|
|
dcd44d |
+ 0x4f extra: reference Counts the total number of L2 cache references.
|
|
|
dcd44d |
+name:uops_retired type:exclusive default:all
|
|
|
dcd44d |
+ 0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer
|
|
|
dcd44d |
+ 0x10 extra: all Counts the number of uops retired
|
|
|
dcd44d |
+ 0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt.
|
|
|
dcd44d |
+ 0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies.
|
|
|
dcd44d |
+name:machine_clears type:exclusive default:all
|
|
|
dcd44d |
+ 0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page.
|
|
|
dcd44d |
+ 0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards.
|
|
|
dcd44d |
+ 0x04 extra: fp_assist Counts the number of floating operations retired that required microcode assists
|
|
|
dcd44d |
+ 0x08 extra: all Counts all machine clears
|
|
|
dcd44d |
+name:br_inst_retired type:exclusive default:any
|
|
|
dcd44d |
+ 0x00 extra:pebs any Counts the number of branch instructions retired
|
|
|
dcd44d |
+ 0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps.
|
|
|
dcd44d |
+ 0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken.
|
|
|
dcd44d |
+ 0xf9 extra:pebs call Counts the number of near CALL branch instructions retired.
|
|
|
dcd44d |
+ 0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired.
|
|
|
dcd44d |
+ 0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired.
|
|
|
dcd44d |
+ 0xf7 extra:pebs return Counts the number of near RET branch instructions retired.
|
|
|
dcd44d |
+ 0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP.
|
|
|
dcd44d |
+ 0xbf extra:pebs far_branch Counts the number of far branch instructions retired.
|
|
|
dcd44d |
+name:br_misp_retired type:exclusive default:any
|
|
|
dcd44d |
+ 0x00 extra:pebs any All mispredicted branches
|
|
|
dcd44d |
+ 0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired
|
|
|
dcd44d |
+ 0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired
|
|
|
dcd44d |
+ 0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired.
|
|
|
dcd44d |
+ 0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired.
|
|
|
dcd44d |
+ 0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired
|
|
|
dcd44d |
+ 0xf7 extra:pebs return Number of mispredicted return branch instructions retired
|
|
|
dcd44d |
+ 0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired
|
|
|
dcd44d |
+ 0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired.
|
|
|
dcd44d |
+name:no_alloc_cycles type:exclusive default:all
|
|
|
dcd44d |
+ 0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full
|
|
|
dcd44d |
+ 0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire.
|
|
|
dcd44d |
+ 0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted.
|
|
|
dcd44d |
+ 0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason.
|
|
|
dcd44d |
+name:rs_full_stall type:exclusive default:all
|
|
|
dcd44d |
+ 0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry.
|
|
|
dcd44d |
+ 0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full.
|
|
|
dcd44d |
+name:cycles_div_busy type:mandatory default:all
|
|
|
dcd44d |
+ 0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider
|
|
|
dcd44d |
+name:baclears type:exclusive default:all
|
|
|
dcd44d |
+ 0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end.
|
|
|
dcd44d |
+ 0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end.
|
|
|
dcd44d |
+ 0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end.
|
|
|
dcd44d |
+name:ms_decoded type:mandatory default:ms_entry
|
|
|
dcd44d |
+ 0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops.
|
|
|
dcd44d |
diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
|
|
|
dcd44d |
index 7acecda..610121e 100644
|
|
|
dcd44d |
--- a/libop/op_cpu_type.c
|
|
|
dcd44d |
+++ b/libop/op_cpu_type.c
|
|
|
dcd44d |
@@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
|
|
|
dcd44d |
{ "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 },
|
|
|
dcd44d |
{ "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 },
|
|
|
dcd44d |
{ "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 },
|
|
|
dcd44d |
+ { "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 },
|
|
|
dcd44d |
};
|
|
|
dcd44d |
|
|
|
dcd44d |
static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
|
|
|
dcd44d |
@@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type)
|
|
|
dcd44d |
case CPU_WESTMERE:
|
|
|
dcd44d |
case CPU_SANDYBRIDGE:
|
|
|
dcd44d |
case CPU_IVYBRIDGE:
|
|
|
dcd44d |
+ case CPU_KNIGHTSLANDING:
|
|
|
dcd44d |
return CPU_ARCH_PERFMON;
|
|
|
dcd44d |
default:
|
|
|
dcd44d |
/* assume processor in a class by itself */
|
|
|
dcd44d |
diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h
|
|
|
dcd44d |
index 39b7726..e2f8f60 100644
|
|
|
dcd44d |
--- a/libop/op_cpu_type.h
|
|
|
dcd44d |
+++ b/libop/op_cpu_type.h
|
|
|
dcd44d |
@@ -106,6 +106,7 @@ typedef enum {
|
|
|
dcd44d |
CPU_SKYLAKE, /** < Intel Skylake microarchitecture */
|
|
|
dcd44d |
CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */
|
|
|
dcd44d |
CPU_PPC64_POWER9, /**< ppc64 POWER8 family */
|
|
|
dcd44d |
+ CPU_KNIGHTSLANDING, /** Intel Knights Landing microarchitecture */
|
|
|
dcd44d |
MAX_CPU_TYPE
|
|
|
dcd44d |
} op_cpu;
|
|
|
dcd44d |
|
|
|
dcd44d |
diff --git a/libop/op_events.c b/libop/op_events.c
|
|
|
dcd44d |
index 0ba57e0..acadaa7 100644
|
|
|
dcd44d |
--- a/libop/op_events.c
|
|
|
dcd44d |
+++ b/libop/op_events.c
|
|
|
dcd44d |
@@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
|
|
|
dcd44d |
case CPU_WESTMERE:
|
|
|
dcd44d |
case CPU_SANDYBRIDGE:
|
|
|
dcd44d |
case CPU_IVYBRIDGE:
|
|
|
dcd44d |
+ case CPU_KNIGHTSLANDING:
|
|
|
dcd44d |
case CPU_MIPS_LOONGSON2:
|
|
|
dcd44d |
case CPU_FAMILY12H:
|
|
|
dcd44d |
case CPU_FAMILY14H:
|
|
|
dcd44d |
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
|
|
|
dcd44d |
index 2061760..840e7b3 100644
|
|
|
dcd44d |
--- a/libop/op_hw_specific.h
|
|
|
dcd44d |
+++ b/libop/op_hw_specific.h
|
|
|
dcd44d |
@@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
|
|
|
dcd44d |
case 0x5c:
|
|
|
dcd44d |
case 0x5f:
|
|
|
dcd44d |
return CPU_GOLDMONT;
|
|
|
dcd44d |
+ case 0x57:
|
|
|
dcd44d |
+ case 0x85:
|
|
|
dcd44d |
+ return CPU_KNIGHTSLANDING;
|
|
|
dcd44d |
}
|
|
|
dcd44d |
}
|
|
|
dcd44d |
return cpu_type;
|
|
|
dcd44d |
diff --git a/utils/ophelp.c b/utils/ophelp.c
|
|
|
dcd44d |
index 6eb299c..f76bf2a 100644
|
|
|
dcd44d |
--- a/utils/ophelp.c
|
|
|
dcd44d |
+++ b/utils/ophelp.c
|
|
|
dcd44d |
@@ -554,6 +554,12 @@ int main(int argc, char const * argv[])
|
|
|
dcd44d |
"Intel Architecture Optimization Reference Manual\n\n";
|
|
|
dcd44d |
break;
|
|
|
dcd44d |
|
|
|
dcd44d |
+ case CPU_KNIGHTSLANDING:
|
|
|
dcd44d |
+ event_doc =
|
|
|
dcd44d |
+ "See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n"
|
|
|
dcd44d |
+ "Intel Architecture Optimization Reference Manual\n\n";
|
|
|
dcd44d |
+ break;
|
|
|
dcd44d |
+
|
|
|
dcd44d |
case CPU_ARCH_PERFMON:
|
|
|
dcd44d |
event_doc =
|
|
|
dcd44d |
"See Intel 64 and IA-32 Architectures Software Developer's Manual\n"
|