commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089 Author: Michael Petlan Date: Tue May 16 23:21:47 2017 +0200 oprofile: Add support for Intel Xeon Phi (Knights Landing) Adds support for Intel Xeon Phi (Knights Landing and Knights Mill) processors to oprofile. Only core events are supported. The events/umasks configuration has been taken from Intel Xeon Phi Processor Performance Monitoring Reference Manual, volume 2. All the events were tested on a Knights Mill machine. Signed-off-by: Michael Petlan diff --git a/events/Makefile.am b/events/Makefile.am index b8f06af..13d063a 100644 --- a/events/Makefile.am +++ b/events/Makefile.am @@ -21,6 +21,7 @@ event_files = \ i386/skylake/events i386/skylake/unit_masks \ i386/silvermont/events i386/silvermont/unit_masks \ i386/goldmont/events i386/goldmont/unit_masks \ + i386/knightslanding/events i386/knightslanding/unit_masks \ ia64/ia64/events ia64/ia64/unit_masks \ ia64/itanium2/events ia64/itanium2/unit_masks \ ia64/itanium/events ia64/itanium/unit_masks \ diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events new file mode 100644 index 0000000..d34feca --- /dev/null +++ b/events/i386/knightslanding/events @@ -0,0 +1,26 @@ +# +# Intel "Knights Landing" microarchitecture core events. +# +# See http://ark.intel.com/ for help in identifying Knights Landing CPUs +# +# Note the minimum counts are not discovered experimentally and could be likely +# lowered in many cases without ill effect. +# +include:i386/arch_perfmon +event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue +event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired. 
+event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks +event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected. +event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition. +event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches +event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled +event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests +event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops +event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire. +event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired +event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired +event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated +event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full.
+event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy +event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor +event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks new file mode 100644 index 0000000..b0e7910 --- /dev/null +++ b/events/i386/knightslanding/unit_masks @@ -0,0 +1,91 @@ +# +# Unit masks for the Intel "Knights Landing" micro architecture +# +# See http://ark.intel.com/ for help in identifying Knights Landing CPUs +# +include:i386/arch_perfmon +name:recycleq type:exclusive default:any_ld + 0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store. + 0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready. + 0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once. + 0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once. + 0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores. + 0x20 extra: sta_full Counts the store micro-ops retired that were pushed in the rehab queue because the store address buffer is full. + 0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason. + 0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason.
+name:mem_uops_retired type:exclusive default:any_loads + 0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache. + 0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2. + 0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2. + 0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss. + 0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss. + 0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state. + 0x40 extra: any_loads Counts all the load micro-ops retired. + 0x80 extra: any_stores Counts all the store micro-ops retired. +name:page_walks type:exclusive default:walks + 0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted. + 0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included. + 0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed. + 0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included. + 0x03 extra:edge walks Counts the total page walks completed (I-side and D-side) + 0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included. 
+name:l2_requests_reject type:mandatory default:all + 0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times. +name:core_reject_l2q type:mandatory default:all + 0x00 extra: all Counts the number of MEC requests that were not accepted into the L2Q because of any L2 queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path +name:icache type:exclusive default:accesses + 0x3 extra: accesses All instruction fetches including uncacheable + 0x1 extra: hits All instruction fetches that hit instruction cache + 0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle +name:fetch_stall type:exclusive default:icache_fill_pending_cycles + 0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses + 0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss. +name:l2_requests type:exclusive default:reference + 0x41 extra: miss Counts the total number of L2 cache misses. + 0x4f extra: reference Counts the total number of L2 cache references. +name:uops_retired type:exclusive default:all + 0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer + 0x10 extra: all Counts the number of uops retired + 0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt.
+ 0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies. +name:machine_clears type:exclusive default:all + 0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page. + 0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards. + 0x04 extra: fp_assist Counts the number of floating operations retired that required microcode assists + 0x08 extra: all Counts all machine clears +name:br_inst_retired type:exclusive default:any + 0x00 extra:pebs any Counts the number of branch instructions retired + 0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps. + 0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken. + 0xf9 extra:pebs call Counts the number of near CALL branch instructions retired. + 0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired. + 0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired. + 0xf7 extra:pebs return Counts the number of near RET branch instructions retired. + 0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP. + 0xbf extra:pebs far_branch Counts the number of far branch instructions retired. +name:br_misp_retired type:exclusive default:any + 0x00 extra:pebs any All mispredicted branches + 0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired + 0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired + 0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired. 
+ 0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired. + 0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired + 0xf7 extra:pebs return Number of mispredicted return branch instructions retired + 0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired + 0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired. +name:no_alloc_cycles type:exclusive default:all + 0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full + 0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire. + 0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted. + 0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason. +name:rs_full_stall type:exclusive default:all + 0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry. + 0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full. +name:cycles_div_busy type:mandatory default:all + 0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider +name:baclears type:exclusive default:all + 0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end. + 0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end.
+ 0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end. +name:ms_decoded type:mandatory default:ms_entry + 0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops. diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c index 7acecda..610121e 100644 --- a/libop/op_cpu_type.c +++ b/libop/op_cpu_type.c @@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, { "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 }, + { "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 }, }; static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); @@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) case CPU_WESTMERE: case CPU_SANDYBRIDGE: case CPU_IVYBRIDGE: + case CPU_KNIGHTSLANDING: return CPU_ARCH_PERFMON; default: /* assume processor in a class by itself */ diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h index 39b7726..e2f8f60 100644 --- a/libop/op_cpu_type.h +++ b/libop/op_cpu_type.h @@ -106,6 +106,7 @@ typedef enum { CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ CPU_PPC64_POWER9, /**< ppc64 POWER8 family */ + CPU_KNIGHTSLANDING, /** Intel Knights Landing microarchitecture */ MAX_CPU_TYPE } op_cpu; diff --git a/libop/op_events.c b/libop/op_events.c index 0ba57e0..acadaa7 100644 --- a/libop/op_events.c +++ b/libop/op_events.c @@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) case CPU_WESTMERE: case CPU_SANDYBRIDGE: case CPU_IVYBRIDGE: + case CPU_KNIGHTSLANDING: case CPU_MIPS_LOONGSON2: case CPU_FAMILY12H: case CPU_FAMILY14H: diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h index 2061760..840e7b3 100644 --- 
a/libop/op_hw_specific.h +++ b/libop/op_hw_specific.h @@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) case 0x5c: case 0x5f: return CPU_GOLDMONT; + case 0x57: + case 0x85: + return CPU_KNIGHTSLANDING; } } return cpu_type; diff --git a/utils/ophelp.c b/utils/ophelp.c index 6eb299c..f76bf2a 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -554,6 +554,12 @@ int main(int argc, char const * argv[]) "Intel Architecture Optimization Reference Manual\n\n"; break; + case CPU_KNIGHTSLANDING: + event_doc = + "See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n" + "Intel Architecture Optimization Reference Manual\n\n"; + break; + case CPU_ARCH_PERFMON: event_doc = "See Intel 64 and IA-32 Architectures Software Developer's Manual\n"