diff --git a/SOURCES/oprofile-knl.patch b/SOURCES/oprofile-knl.patch new file mode 100644 index 0000000..53b2093 --- /dev/null +++ b/SOURCES/oprofile-knl.patch @@ -0,0 +1,231 @@ +commit 8050eb1d3095cc3b1e7a3344c85be4d9c591c089 +Author: Michael Petlan +Date: Tue May 16 23:21:47 2017 +0200 + + oprofile: Add support for Intel Xeon Phi (Knights Landing) + + Adds support for Intel Xeon Phi (Knights Landing and Knights Mill) + processors to oprofile. Only core events are supported. + + The events/umasks configuration has been taken from Intel Xeon Phi + Processor Performance Monitoring Reference Manual, volume 2. All + the events were tested on a Knights Mill machine. + + Signed-off-by: Michael Petlan + +diff --git a/events/Makefile.am b/events/Makefile.am +index b8f06af..13d063a 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -21,6 +21,7 @@ event_files = \ + i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ + i386/goldmont/events i386/goldmont/unit_masks \ ++ i386/knightslanding/events i386/knightslanding/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ + ia64/itanium/events ia64/itanium/unit_masks \ +diff --git a/events/i386/knightslanding/events b/events/i386/knightslanding/events +new file mode 100644 +index 0000000..d34feca +--- /dev/null ++++ b/events/i386/knightslanding/events +@@ -0,0 +1,26 @@ ++# ++# Intel "Knights Landing" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Knights Landing CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++include:i386/arch_perfmon ++event:0x03 counters:cpuid um:recycleq minimum:20000 name:recycleq : Counts the number of retired load or store micro-ops that get pushed into the Recycle Queue ++event:0x04 counters:cpuid um:mem_uops_retired minimum:100000 name:mem_uops_retired : Counts the number of memory micro-ops retired. ++event:0x05 counters:cpuid um:page_walks minimum:500 name:page_walks : Counts the number of core cycles for page walks ++event:0x30 counters:cpuid um:l2_requests_reject minimum:500 name:l2_requests_reject : Counts the number of MEC requests from the L2Q that reference a cache line and were rejected. ++event:0x31 counters:cpuid um:core_reject_l2q minimum:100 name:core_reject_l2q : Number of requests not accepted into the L2Q because of any L2 queue reject condition. ++event:0x80 counters:cpuid um:icache minimum:100000 name:icache : Instruction fetches ++event:0x86 counters:cpuid um:fetch_stall minimum:100000 name:fetch_stall : Counts the number of core cycles the instruction fetch pipe was stalled ++event:0x2e counters:cpuid um:l2_requests minimum:10000 name:l2_requests : L2 cache requests ++event:0xc2 counters:cpuid um:uops_retired minimum:100000 name:uops_retired : Retired uops ++event:0xc3 counters:cpuid um:machine_clears minimum:500 name:machine_clears : Counts the number of times that the machine clears at retire. ++event:0xc4 counters:cpuid um:br_inst_retired minimum:50000 name:br_inst_retired : Counts the number of branch instructions retired ++event:0xc5 counters:cpuid um:br_misp_retired minimum:5000 name:br_misp_retired : Counts the number of mispredicted branch instructions retired ++event:0xca counters:cpuid um:no_alloc_cycles minimum:500000 name:no_alloc_cycles : Counts the number of core cycles when no micro-ops are allocated ++event:0xcb counters:cpuid um:rs_full_stall minimum:100000 name:rs_full_stall : Counts the number of core cycles when the allocate stalls because the required RS is full. 
++event:0xcd counters:cpuid um:cycles_div_busy minimum:1000 name:cycles_div_busy : Number of core cycles when divider is busy ++event:0xe6 counters:cpuid um:baclears minimum:10000 name:baclears : Counts the number of times Branch Target Buffer (BTB) prediction was corrected by a later branch predictor ++event:0xe7 counters:cpuid um:ms_decoded minimum:10000 name:ms_decoded : Microcode sequencer decode entrypoints +diff --git a/events/i386/knightslanding/unit_masks b/events/i386/knightslanding/unit_masks +new file mode 100644 +index 0000000..b0e7910 +--- /dev/null ++++ b/events/i386/knightslanding/unit_masks +@@ -0,0 +1,91 @@ ++# ++# Unit masks for the Intel "Knights Landing" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Knights Landing CPUs ++# ++include:i386/arch_perfmon ++name:recycleq type:exclusive default:any_ld ++ 0x01 extra:pebs ld_block_st_forward Counts the number of occurrences a retired load gets blocked because its address partially overlaps with a store. ++ 0x02 extra: ld_block_std_notready Counts the number of occurrences a retired load gets blocked because its address overlaps with a store whose data is not ready. ++ 0x04 extra: st_splits Counts the number of occurrences a retired store that is a cache line split. Each split should be counted only once. ++ 0x08 extra:pebs ld_splits Counts the number of occurrences a retired load that is a cache line split. Each split should be counted only once. ++ 0x10 extra: lock Counts all the retired locked loads. It does not include stores because we would double count if we count stores. ++ 0x20 extra: sta_full Counts the store micro-ops retired that were pushed in the rehad queue because the store address buffer is full. ++ 0x40 extra: any_ld Counts any retired load that was pushed into the recycle queue for any reason. ++ 0x80 extra: any_st Counts any retired store that was pushed into the recycle queue for any reason. 
++name:mem_uops_retired type:exclusive default:any_loads ++ 0x01 extra: l1_miss_loads Counts the number of load micro-ops retired that miss in L1 D cache. ++ 0x02 extra:pebs l2_hit_loads Counts the number of load micro-ops retired that hit in the L2. ++ 0x04 extra:pebs l2_miss_loads Counts the number of load micro-ops retired that miss in the L2. ++ 0x08 extra:pebs dtlb_miss_loads Counts the number of load micro-ops retired that cause a DTLB miss. ++ 0x10 extra: utlb_miss_loads Counts the number of load micro-ops retired that caused micro TLB miss. ++ 0x20 extra:pebs hitm Counts the loads retired that get the data from the other core in the same tile in M state. ++ 0x40 extra: any_loads Counts all the load micro-ops retired. ++ 0x80 extra: any_stores Counts all the store micro-ops retired. ++name:page_walks type:exclusive default:walks ++ 0x01 extra:edge d_side_walks Counts the total D-side page walks that are completed or started. The page walks started in the speculative path will also be counted. ++ 0x01 extra: d_side_cycles Counts the total number of core cycles for all the D-side page walks. The cycles for page walks started in speculative path will also be included. ++ 0x02 extra:edge i_side_walks Counts the total I-side page walks that are completed. ++ 0x02 extra: i_side_cycles Counts the total number of core cycles for all the I-side page walks. The cycles for page walks started in speculative path will also be included. ++ 0x03 extra:edge walks Counts the total page walks completed (I-side and D-side) ++ 0x03 extra: cycles Counts the total number of core cycles for all the page walks. The cycles for page walks started in speculative path will also be included. 
++name:l2_requests_reject type:mandatory default:all ++ 0x00 extra: all Counts the number of MEC requests from the L2Q that reference a cache line excluding SW prefetches filling only to L2 cache and L1 evictions (automatically excludes L2HWP, UC, WC) that were rejected - Multiple repeated rejects should be counted multiple times. ++name:core_reject_l2q type:mandatory default:all ++ 0x00 extra: all Counts the number of MEC requests that were not accepted into the L2Q because of any L2 queue reject condition. There is no concept of at-ret here. It might include requests due to instructions in the speculative path ++name:icache type:exclusive default:accesses ++ 0x3 extra: accesses All instruction fetches including uncacheable ++ 0x1 extra: hits All instruction fetches that hit instruction cache ++ 0x2 extra: misses All instruction fetches that missed instruction cache (produced a memory request); counted only once, not once per outstanding cycle ++name:fetch_stall type:exclusive default:icache_fill_pending_cycles ++ 0x01 extra: icache_fill_pending_cycles Counts the number of core cycles the fetch stalls because of an icache miss. This is a cumulative count of core cycles the fetch stalled for all icache misses ++ 0x01 extra:edge icache_fill_pending_edge Counts the number of times it happens that fetch stalls because of an icache miss. ++name:l2_requests type:exclusive default:reference ++ 0x41 extra: miss Counts the total number of L2 cache misses. ++ 0x4f extra: reference Counts the total number of L2 cache references. ++name:uops_retired type:exclusive default:all ++ 0x01 extra: ms Counts the number of uops retired that are from complex flows issued by the micro-sequencer ++ 0x10 extra: all Counts the number of uops retired ++ 0x20 extra: scalar_simd Counts the number of scalar SSE, AVX, AVX2, AVX-512 micro-ops except for loads (memory-to-register mov-type micro ops), division, sqrt. 
++ 0x40 extra: packed_simd Counts the number of packed SSE, AVX, AVX2, AVX-512 micro-ops (both floating point and integer) except for loads (memory-to-register mov-type micro-ops), packed byte and word multiplies. ++name:machine_clears type:exclusive default:all ++ 0x01 extra: smc Counts the number of times that the machine clears due to program modifying data within 1K of a recently fetched code page. ++ 0x02 extra: memory_ordering Counts the number of times the machine clears due to memory ordering hazards. ++ 0x04 extra: fp_assist Counts the number of floating operations retired that required microcode assists ++ 0x08 extra: all Counts all machine clears ++name:br_inst_retired type:exclusive default:any ++ 0x00 extra:pebs any Counts the number of branch instructions retired ++ 0x7e extra:pebs jcc Counts the number of branch instructions retired that were conditional jumps. ++ 0xfe extra:pebs taken_jcc Counts the number of branch instructions retired that were conditional jumps and predicted taken. ++ 0xf9 extra:pebs call Counts the number of near CALL branch instructions retired. ++ 0xfd extra:pebs rel_call Counts the number of near relative CALL branch instructions retired. ++ 0xfb extra:pebs ind_call Counts the number of near indirect CALL branch instructions retired. ++ 0xf7 extra:pebs return Counts the number of near RET branch instructions retired. ++ 0xeb extra:pebs non_return_ind Counts the number of branch instructions retired that were near indirect CALL or near indirect JMP. ++ 0xbf extra:pebs far_branch Counts the number of far branch instructions retired. ++name:br_misp_retired type:exclusive default:any ++ 0x00 extra:pebs any All mispredicted branches ++ 0x7e extra:pebs jcc Number of mispredicted conditional branch instructions retired ++ 0xfe extra:pebs taken_jcc Number of mispredicted taken conditional branch instructions retired ++ 0xf9 extra:pebs call Counts the number of mispredicted near CALL branch instructions retired. 
++ 0xfd extra:pebs rel_call Counts the number of mispredicted near relative CALL branch instructions retired. ++ 0xfb extra:pebs ind_call Number of mispredicted indirect call branch instructions retired ++ 0xf7 extra:pebs return Number of mispredicted return branch instructions retired ++ 0xeb extra:pebs non_return_ind Number of mispredicted non-return branch instructions retired ++ 0xbf extra:pebs far_branch Counts the number of mispredicted far branch instructions retired. ++name:no_alloc_cycles type:exclusive default:all ++ 0x01 extra: rob_full Counts the number of core cycles when no micro-ops are allocated and the ROB is full ++ 0x02 extra: mispredicts Counts the number of core cycles when no micro-ops are allocated and the alloc pipe is stalled waiting for a mispredicted branch to retire. ++ 0x20 extra: rat_stall Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted. ++ 0x7f extra: all Counts the total number of core cycles when no micro-ops are allocated for any reason. ++name:rs_full_stall type:exclusive default:all ++ 0x01 extra: mec Counts the number of core cycles when allocation pipeline is stalled and is waiting for a free MEC reservation station entry. ++ 0x1f extra: all Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full. ++name:cycles_div_busy type:mandatory default:all ++ 0x01 extra: all Counts the number of core cycles when divider is busy, does not imply a stall waiting for the divider ++name:baclears type:exclusive default:all ++ 0x01 extra: all Counts the number of times front-end resteers for any branch as a result of another branch handling mechanism in the front-end. ++ 0x08 extra: return Counts the number of times the front-end resteers for RET branches as a result of another branch handling mechanism in the front-end. 
++ 0x10 extra: cond Counts the number of times the front-end resteers for conditional branches as a result of another branch handling mechanism in the front-end. ++name:ms_decoded type:mandatory default:ms_entry ++ 0x01 extra: ms_entry Counts the number of times the MSROM starts a flow of uops. +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 7acecda..610121e 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -120,6 +120,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, + { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, + { "ppc64 POWER9", "ppc64/power9", CPU_PPC64_POWER9, 6 }, ++ { "Intel Knights Landing", "i386/knightslanding", CPU_KNIGHTSLANDING, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -752,6 +753,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: ++ case CPU_KNIGHTSLANDING: + return CPU_ARCH_PERFMON; + default: + /* assume processor in a class by itself */ +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 39b7726..e2f8f60 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -106,6 +106,7 @@ typedef enum { + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ + CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ + CPU_PPC64_POWER9, /**< ppc64 POWER8 family */ ++ CPU_KNIGHTSLANDING, /**< Intel Knights Landing microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 0ba57e0..acadaa7 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1204,6 +1204,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: ++ case CPU_KNIGHTSLANDING: + case CPU_MIPS_LOONGSON2: + case CPU_FAMILY12H: + case CPU_FAMILY14H: +diff --git 
a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 2061760..840e7b3 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -167,6 +167,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x5c: + case 0x5f: + return CPU_GOLDMONT; ++ case 0x57: ++ case 0x85: ++ return CPU_KNIGHTSLANDING; + } + } + return cpu_type; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 6eb299c..f76bf2a 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -554,6 +554,12 @@ int main(int argc, char const * argv[]) + "Intel Architecture Optimization Reference Manual\n\n"; + break; + ++ case CPU_KNIGHTSLANDING: ++ event_doc = ++ "See Intel Xeon Phi(TM) Processor Performance Monitoring Reference and\n" ++ "Intel Architecture Optimization Reference Manual\n\n"; ++ break; ++ + case CPU_ARCH_PERFMON: + event_doc = + "See Intel 64 and IA-32 Architectures Software Developer's Manual\n" diff --git a/SOURCES/oprofile-power9.patch b/SOURCES/oprofile-power9.patch index a1d96f6..67196e2 100644 --- a/SOURCES/oprofile-power9.patch +++ b/SOURCES/oprofile-power9.patch @@ -1187,3 +1187,34 @@ index f76bf2a..9a2a7dd 100644 break; case CPU_MIPS_20K: +commit 4a518527ba2226f57c6522693d55bd5a6a748a35 +Author: Will Schmidt +Date: Tue Jul 25 11:59:00 2017 -0500 + + power9, remove event entries with 0x00 values. + + A few of the events in the power9 list have event numbers of all zero, which + aren't useful to us. The events have valid values elsewhere in the tables, so + squash the zero entries. + + Due credit, this was noticed and reported to me by Michael Petlan (mpetlan@redhat.com). 
+ + Signed-off-by: Will Schmidt + +diff --git a/events/ppc64/power9/events b/events/ppc64/power9/events +index 7264515..2116c9b 100644 +--- a/events/ppc64/power9/events ++++ b/events/ppc64/power9/events +@@ -865,12 +865,10 @@ event:0x000001688C counters:0 um:zero minimum:10000 name:PM_RC_USAGE : Co + event:0x00000468A6 counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : Read clearing SC + event:0x00000460A6 counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : Read forming SC + event:0x00000268A8 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : RD machine hit L3 PF machine +-event:0x0000000 counters:5 um:zero minimum:10000 name:PM_RUN_CYC : Run_cycles + event:0x00000200F4 counters:1 um:zero minimum:10000 name:PM_RUN_CYC : Run_cycles + event:0x000003006C counters:2 um:zero minimum:100000 name:PM_RUN_CYC_SMT2_MODE : Cycles in which this thread's run latch is set and the core is in SMT2 mode + event:0x000002006C counters:1 um:zero minimum:100000 name:PM_RUN_CYC_SMT4_MODE : Cycles in which this thread's run latch is set and the core is in SMT4 mode + event:0x000001006C counters:0 um:zero minimum:100000 name:PM_RUN_CYC_ST_MODE : Cycles run latch is set and core is in ST mode +-event:0x0000000 counters:4 um:zero minimum:10000 name:PM_RUN_INST_CMPL : Run_Instructions + event:0x00000400FA counters:3 um:zero minimum:10000 name:PM_RUN_INST_CMPL : Run_Instructions + event:0x00000400F4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR + event:0x0000010008 counters:0 um:zero minimum:10000 name:PM_RUN_SPURR : Run SPURR diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec index c60786a..d9261bc 100644 --- a/SPECS/oprofile.spec +++ b/SPECS/oprofile.spec @@ -1,7 +1,7 @@ Summary: System wide profiler Name: oprofile Version: 0.9.9 -Release: 23%{?dist} +Release: 25%{?dist} License: GPLv2+ and LGPLv2+ Group: Development/System # @@ -41,6 +41,7 @@ Patch1007: oprofile-order.patch Patch1010: oprofile-rhbz1385007.patch Patch1011: oprofile-rhbz1426426.patch 
Patch2000: oprofile-power9.patch +Patch2001: oprofile-knl.patch URL: http://oprofile.sf.net @@ -134,6 +135,7 @@ agent library. %patch1010 -p1 -b .rhbz1385007 %patch1011 -p1 -b .rhbz1426426 %patch2000 -p1 -b .power9 +%patch2001 -p1 -b .knl ./autogen.sh @@ -220,6 +222,12 @@ exit 0 %{_sysconfdir}/ld.so.conf.d/* %changelog +* Mon Sep 25 2017 William Cohen - 0.9.9-25 +- Add Intel Xeon Phi support. rhbz1465354 + +* Thu Aug 31 2017 William Cohen - 0.9.9-24 +- Update power9 events. + * Thu Jun 22 2017 William Cohen - 0.9.9-23 - Add power9 support.