From 75c84fd503708aff275fdaa113e93e90847d7454 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Nov 15 2016 08:24:30 +0000 Subject: import devtoolset-6-oprofile-1.1.0-4.el7 --- diff --git a/.devtoolset-6-oprofile.metadata b/.devtoolset-6-oprofile.metadata new file mode 100644 index 0000000..43f54fa --- /dev/null +++ b/.devtoolset-6-oprofile.metadata @@ -0,0 +1,2 @@ +913be8e806779f7c6ba394113cf75da67d3cf03e SOURCES/openjdk-include.tar.gz +38c0d8812fe605f6ddd1cd183a482aa7605c0e81 SOURCES/oprofile-1.1.0.tar.gz diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e8cf836 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +SOURCES/openjdk-include.tar.gz +SOURCES/oprofile-1.1.0.tar.gz diff --git a/README.md b/README.md deleted file mode 100644 index 98f42b4..0000000 --- a/README.md +++ /dev/null @@ -1,4 +0,0 @@ -The master branch has no content - -Look at the c7 branch if you are working with CentOS-7, or the c4/c5/c6 branch for CentOS-4, 5 or 6 -If you find this file in a distro specific branch, it means that no content has been checked in yet diff --git a/SOURCES/oprofile-bz1335142.patch b/SOURCES/oprofile-bz1335142.patch new file mode 100644 index 0000000..ddfa4f1 --- /dev/null +++ b/SOURCES/oprofile-bz1335142.patch @@ -0,0 +1,40 @@ +commit a99127699330dce984dba38156230ab3584d0d6e +Author: William Cohen +Date: Mon Nov 30 17:13:32 2015 -0500 + + Make Intel Westmere and Nehalem event names unique + + The Intel Westmere and Nehalem event lists each had two events named + MACRO_INSTS. The event names in the event lists need to be unique. + The event refererring to the Macro-fused instructions decoded (0xa6) + has been renamed MACRO_INSTS_FUSED to avoid the name collision with + MACRO_INSTS. 
+ + Signed-off-by: William Cohen + +diff --git a/events/i386/nehalem/events b/events/i386/nehalem/events +index 31a08b6..6951f35 100644 +--- a/events/i386/nehalem/events ++++ b/events/i386/nehalem/events +@@ -68,7 +68,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:6000 name:ILD_STALL : Cycles In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:6000 name:BR_INST_EXEC : Counts the number of near branch instructions executed, but not necessarily retired. + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:6000 name:BR_MISP_EXEC : Counts the number of mispredicted conditional near branch instructions executed, but not necessarily retired. + event:0xA2 counters:0,1,2,3 um:resource_stalls minimum:6000 name:RESOURCE_STALLS : Counts the number of Allocator resource related stalls. Includes register renaming buffer entries, memory buffer entries. In addition to resource related stalls, this event counts some other events. Includes stalls arising during branch misprediction recovery, such as if retirement of the mispredicted branch is delayed and stalls arising while store buffer is draining from synchronizing operations. +-event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. ++event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS_FUSED : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. + event:0xA7 counters:0,1,2,3 um:one minimum:6000 name:BACLEAR_FORCE_IQ : Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ is also responsible for providing conditional branch prediciton direction based on a static scheme and dynamic data provided by the L2 Branch Prediction Unit. If the conditional branch target is not found in the Target Array and the IQ predicts that the branch is taken, then the IQ will force the Branch Address Calculator to issue a BACLEAR. 
Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble in the instruction fetch pipeline. + event:0xA8 counters:0,1,2,3 um:one minimum:6000 name:LSD : Counts the number of micro-ops delivered by loop stream detector + event:0xAE counters:0,1,2,3 um:one minimum:6000 name:ITLB_FLUSH : Counts the number of ITLB flushes +diff --git a/events/i386/westmere/events b/events/i386/westmere/events +index d919867..d7b2064 100644 +--- a/events/i386/westmere/events ++++ b/events/i386/westmere/events +@@ -48,7 +48,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:2000000 name:ILD_STALL : Any In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:200000 name:BR_INST_EXEC : Branch instructions executed + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:20000 name:BR_MISP_EXEC : Mispredicted branches executed + event:0xa2 counters:0,1,2,3 um:resource_stalls minimum:2000000 name:RESOURCE_STALLS : Resource related stall cycles +-event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS : Macro-fused instructions decoded ++event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS_FUSED : Macro-fused instructions decoded + event:0xa7 counters:0,1,2,3 um:x01 minimum:2000000 name:BACLEAR_FORCE_IQ : Instruction queue forced BACLEAR + event:0xa8 counters:0,1,2,3 um:x01 minimum:2000000 name:LSD : Cycles when uops were delivered by the LSD + event:0xae counters:0,1,2,3 um:x01 minimum:2000000 name:ITLB_FLUSH : ITLB flushes diff --git a/SOURCES/oprofile-goldmont.patch b/SOURCES/oprofile-goldmont.patch new file mode 100644 index 0000000..6210a98 --- /dev/null +++ b/SOURCES/oprofile-goldmont.patch @@ -0,0 +1,602 @@ +From 0ad5a9e6af86a88e1dd41180f45bc48b646eba6a Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Tue, 26 Apr 2016 07:52:51 -0700 +Subject: [PATCH 09/18] oprofile: Add support for Goldmont events + +Add support for the Intel Goldmont events. + +OFFCORE_RESPONSE.* is not supported. + +v2: Fix typos in descriptions. 
+v3: Add inst_retired.any_pebs +Signed-off-by: Andi Kleen +--- + events/Makefile.am | 1 + + events/i386/goldmont/events | 34 +++++++++ + events/i386/goldmont/unit_masks | 155 ++++++++++++++++++++++++++++++++++++++++ + libop/op_cpu_type.c | 2 + + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + libop/op_hw_specific.h | 3 + + utils/ophelp.c | 1 + + 8 files changed, 198 insertions(+) + create mode 100644 events/i386/goldmont/events + create mode 100644 events/i386/goldmont/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index 56f9020..677b05f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -20,6 +20,7 @@ event_files = \ + i386/broadwell/events i386/broadwell/unit_masks \ + i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ ++ i386/goldmont/events i386/goldmont/unit_masks \ + ppc64/architected_events_v1/events ppc64/architected_events_v1/unit_masks \ + ppc64/power4/events ppc64/power4/event_mappings ppc64/power4/unit_masks \ + ppc64/power5/events ppc64/power5/event_mappings ppc64/power5/unit_masks \ +diff --git a/events/i386/goldmont/events b/events/i386/goldmont/events +new file mode 100644 +index 0000000..111438e +--- /dev/null ++++ b/events/i386/goldmont/events +@@ -0,0 +1,34 @@ ++# ++# Intel "Goldmont" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x03 counters:cpuid um:ld_blocks minimum:200003 name:ld_blocks : ++event:0x05 counters:cpuid um:page_walks minimum:200003 name:page_walks : ++event:0x0e counters:cpuid um:uops_issued minimum:200003 name:uops_issued_any : ++event:0x13 counters:cpuid um:misalign_mem_ref minimum:200003 name:misalign_mem_ref : ++event:0x2e counters:cpuid um:longest_lat_cache minimum:200003 name:longest_lat_cache : ++event:0x30 counters:cpuid um:l2_reject_xq minimum:200003 name:l2_reject_xq_all : ++event:0x31 counters:cpuid um:core_reject_l2q minimum:200003 name:core_reject_l2q_all : ++event:0x51 counters:cpuid um:dl1 minimum:200003 name:dl1_dirty_eviction : ++event:0x80 counters:cpuid um:icache minimum:200003 name:icache : ++event:0x81 counters:cpuid um:itlb minimum:200003 name:itlb_miss : ++event:0x86 counters:cpuid um:fetch_stall minimum:200003 name:fetch_stall_icache_fill_pending_cycles : ++event:0x9c counters:cpuid um:uops_not_delivered minimum:200003 name:uops_not_delivered_any : ++event:0xc0 counters:cpuid um:inst_retired minimum:2000003 name:inst_retired : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:200003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:200003 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:200003 name:br_misp_retired : ++event:0xca counters:cpuid um:issue_slots_not_consumed minimum:200003 name:issue_slots_not_consumed : ++event:0xcb counters:cpuid um:hw_interrupts minimum:200003 name:hw_interrupts : ++event:0xcd counters:cpuid um:cycles_div_busy minimum:2000003 name:cycles_div_busy : ++event:0xd0 counters:cpuid um:mem_uops_retired minimum:200003 name:mem_uops_retired : ++event:0xd1 counters:cpuid um:mem_load_uops_retired minimum:200003 name:mem_load_uops_retired : ++event:0xe6 counters:cpuid um:baclears 
minimum:200003 name:baclears : ++event:0xe7 counters:cpuid um:ms_decoded minimum:200003 name:ms_decoded_ms_entry : ++event:0xe9 counters:cpuid um:decode_restriction minimum:200003 name:decode_restriction_predecode_wrong : +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +new file mode 100644 +index 0000000..2f265b3 +--- /dev/null ++++ b/events/i386/goldmont/unit_masks +@@ -0,0 +1,155 @@ ++# ++# Unit masks for the Intel "Goldmont" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++name:core_reject_l2q type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and L1 prefetcher requests rejected by the L2Q due to a full or nearly full condition which likely indicates back pressure from L2Q. It also counts requests that would have gone directly to the XQ, but are rejected due to a full or nearly full condition, indicating back pressure from the IDI link. The L2Q may also reject transactions from a core to ensure fairness between cores, or to delay a core's dirty eviction when the address conflicts with incoming external snoops. ++name:decode_restriction type:mandatory default:0x1 ++ 0x1 extra: predecode_wrong Counts the number of times the prediction (from the predecode cache) for instruction length is incorrect. ++name:dl1 type:mandatory default:0x1 ++ 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback. ++name:fetch_stall type:mandatory default:0x2 ++ 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cumulative count of cycles stalled for all icache misses.
++name:itlb type:mandatory default:0x4 ++ 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translations are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. ++name:l2_reject_xq type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. ++name:ms_decoded type:mandatory default:0x1 ++ 0x1 extra: ms_entry Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++name:uops_issued type:mandatory default:0x0 ++ 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire.
The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a mispredicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. ++name:uops_not_delivered type:mandatory default:0x0 ++ 0x0 extra: any This event is used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++name:cpu_clk_unhalted type:exclusive default:core ++ 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. ++ 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event ++ 0x0 extra: core_p Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x1 extra: ref Reference cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter.
++name:ld_blocks type:exclusive default:all_block ++ 0x10 extra: all_block Counts anytime a load that retires is blocked for any reason. ++ 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. ++ 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. ++ 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. ++name:page_walks type:exclusive default:0x1 ++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side walks (due to data operation) page walk is in progress. ++ 0x2 extra: i_side_cycles Counts every core cycle when a Instruction-side (walks due to an instruction fetch) page walk is in progress. ++ 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. ++name:misalign_mem_ref type:exclusive default:load_page_split ++ 0x2 extra: load_page_split Counts when a memory load of a uop spans a page boundary (a split) is retired. ++ 0x2 extra:pebs load_page_split_pebs Counts when a memory load of a uop spans a page boundary (a split) is retired. 
++ 0x4 extra: store_page_split Counts when a memory store of a uop spans a page boundary (a split) is retired. ++ 0x4 extra:pebs store_page_split_pebs Counts when a memory store of a uop spans a page boundary (a split) is retired. ++name:longest_lat_cache type:exclusive default:0x4f ++ 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. ++ 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. ++name:icache type:exclusive default:0x1 ++ 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache ++ 0x2 extra: misses Counts each cache line access to the Icache that are not fulfilled (miss) by the Icache ++ 0x3 extra: accesses Counts each cache line access to the Icache ++name:inst_retired type:exclusive default:any ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event.
This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++name:uops_retired type:exclusive default:any ++ 0x0 extra: any Counts uops which retired ++ 0x0 extra:pebs any_pebs Counts uops which retired ++ 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x8 extra: fpdiv Counts the number of floating point divide uops retired. ++ 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. ++ 0x10 extra: idiv Counts the number of integer divide uops retired. ++ 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. ++name:machine_clears type:exclusive default:0x0 ++ 0x0 extra: all Counts machine clears for any reason ++ 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. 
For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. ++ 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. ++name:br_inst_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. ++ 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. ++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction was not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction was not taken. ++ 0xf9 extra: call Counts near CALL branch instructions retired. ++ 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. ++ 0xfd extra: rel_call Counts near relative CALL branch instructions retired. ++ 0xfd extra:pebs rel_call_pebs Counts near relative CALL branch instructions retired. ++ 0xfb extra: ind_call Counts near indirect CALL branch instructions retired.
++ 0xfb extra:pebs ind_call_pebs Counts near indirect CALL branch instructions retired. ++ 0xf7 extra: return Counts near return branch instructions retired. ++ 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. ++ 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++name:br_misp_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. ++ 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition).
++ 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. ++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++ 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++name:issue_slots_not_consumed type:exclusive default:0x0 ++ 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. 
Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. ++name:hw_interrupts type:exclusive default:0x1 ++ 0x1 extra: received Counts hardware interrupts received by the processor. ++ 0x4 extra: pending_and_masked Counts core cycles during which there are pending interrupts, but interrupts are masked (EFLAGS.IF = 0). ++name:cycles_div_busy type:exclusive default:0x0 ++ 0x0 extra: all Counts core cycles if either divide unit is busy. ++ 0x1 extra: idiv Counts core cycles the integer divide unit is busy. ++ 0x2 extra: fpdiv Counts core cycles the floating point divide unit is busy. ++name:mem_uops_retired type:exclusive default:all ++ 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x81 extra: all_loads Counts the number of load uops retired ++ 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. 
++ 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. ++ 0x12 extra: dtlb_miss_stores Counts store uops retired that caused a DTLB miss. ++ 0x12 extra:pebs dtlb_miss_stores_pebs Counts store uops retired that caused a DTLB miss. ++ 0x13 extra: dtlb_miss Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundary.
++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundary. ++name:mem_load_uops_retired type:exclusive default:l1_hit ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache ++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or module's cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or module's cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than is typical for a load.
In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. 
++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++name:baclears type:exclusive default:0x1 ++ 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. ++ 0x8 extra: return Counts BACLEARS on return instructions. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index b1d5ecf..7bdde53 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -122,6 +122,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, ++ { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -739,6 +740,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_HASWELL: + case CPU_BROADWELL: + case CPU_SKYLAKE: ++ case CPU_GOLDMONT: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 9983f87..98289c5 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -102,6 +102,7 @@ typedef enum { + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ ++ CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 25f010e..cdd0409 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1212,6 +1212,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + descr->name = "CPU_CLK_UNHALTED"; + break; + ++ case CPU_GOLDMONT: + case CPU_SKYLAKE: + descr->name = "cpu_clk_unhalted"; + break; +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index a6180f4..f4db8f5 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -162,6 +162,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4d: + case 0x4c: + return CPU_SILVERMONT; ++ case 0x5c: ++ case 0x5f: ++ return CPU_GOLDMONT; + } + } + return cpu_type; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 
fdddddc..5821593 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -544,6 +544,7 @@ int main(int argc, char const * argv[]) + case CPU_BROADWELL: + case CPU_SKYLAKE: + case CPU_SILVERMONT: ++ case CPU_GOLDMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +-- +2.7.4 + +From 6f2758a46554f69403e2ebc1a3e4a58350682638 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 6 May 2016 12:11:46 -0700 +Subject: [PATCH 12/18] oprofile: Update Goldmont events + +This patch adds some updates to the Goldmont events. Mainly it is editorial updates +to the event descriptions. In addition it also removes the events not listed +in the SDM (which were not intended to be included) + +v2: Minor edits +Signed-off-by: Andi Kleen +--- + events/i386/goldmont/unit_masks | 96 ++++++++++++++++++++--------------------- + 1 file changed, 47 insertions(+), 49 deletions(-) + +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +index 2f265b3..d1c08d4 100644 +--- a/events/i386/goldmont/unit_masks ++++ b/events/i386/goldmont/unit_masks +@@ -10,17 +10,17 @@ name:decode_restriction type:mandatory default:0x1 + name:dl1 type:mandatory default:0x1 + 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback. + name:fetch_stall type:mandatory default:0x2 +- 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cummulative count of cycles stalled for all icache misses. ++ 0x2 extra: icache_fill_pending_cycles Counts cycles that an ICache miss is outstanding, and instruction fetch is stalled. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes, while an Icache miss outstanding. Note this event is not the same as cycles to retrieve an instruction due to an Icache miss. 
Rather, it is the part of the Instruction Cache (ICache) miss time where no bytes are available for the decoder. + name:itlb type:mandatory default:0x4 + 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translation are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. + name:l2_reject_xq type:mandatory default:0x0 + 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. + name:ms_decoded type:mandatory default:0x1 +- 0x1 extra: ms_entry Counts the number of times the Microcde Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++ 0x1 extra: ms_entry Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. 
The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. + name:uops_issued type:mandatory default:0x0 + 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. + name:uops_not_delivered type:mandatory default:0x0 +- 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++ 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by back end. The back-end then takes these uops, allocates the required resources. When all resources are ready, uops are executed. 
If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when back-end is requesting more uops and front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end inefficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue. + name:cpu_clk_unhalted type:exclusive default:core + 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. + 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state.
The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event +@@ -31,12 +31,14 @@ name:ld_blocks type:exclusive default:all_block + 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. + 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). + 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x2 extra: store_forward Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. ++ 0x2 extra:pebs store_forward_pebs Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. + 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. + 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. + name:page_walks type:exclusive default:0x1 +- 0x1 extra: d_side_cycles Counts every core cycle when a Data-side walks (due to data operation) page walk is in progress. 
++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side (walks due to a data operation) page walk is in progress. + 0x2 extra: i_side_cycles Counts every core cycle when a Instruction-side (walks due to an instruction fetch) page walk is in progress. + 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. + name:misalign_mem_ref type:exclusive default:load_page_split +@@ -48,35 +50,31 @@ name:longest_lat_cache type:exclusive default:0x4f + 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. + 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. + name:icache type:exclusive default:0x1 +- 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache +- 0x2 extra: misses Counts each cache line access to the Icache that are not fullfilled (miss) by the Icache +- 0x3 extra: accesses Counts each cache line access to the Icache ++ 0x1 extra: hit Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is in the ICache (hit). The event strives to count on a cache line basis, so that multiple accesses which hit in a single cache line count as one ICACHE.HIT. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x2 extra: misses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is not in the ICache (miss). The event strives to count on a cache line basis, so that multiple accesses which miss in a single cache line count as one ICACHE.MISS. 
Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is not in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x3 extra: accesses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line. The event strives to count on a cache line basis, so that multiple fetches to a single cache line count as one ICACHE.ACCESS. Specifically, the event counts when accesses from straight line code crosses the cache line boundary, or when a branch target is to a new line. This event counts differently than Intel processors based on Silvermont microarchitecture. + name:inst_retired type:exclusive default:any +- 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event +- 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. +- 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. 
*This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event. ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra:pebs any_p_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. 
+ name:uops_retired type:exclusive default:any + 0x0 extra: any Counts uops which retired + 0x0 extra:pebs any_pebs Counts uops which retired + 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. + 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. +- 0x8 extra: fpdiv Counts the number of floating point divide uops retired. +- 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. +- 0x10 extra: idiv Counts the number of integer divide uops retired. +- 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. + name:machine_clears type:exclusive default:0x0 + 0x0 extra: all Counts machine clears for any reason + 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. +- 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved - as another core is in the process of modifying the data. + 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. 
For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. + 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. + name:br_inst_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. + 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. +- 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. +- 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. 
++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. + 0xf9 extra: call Counts near CALL branch instructions retired. + 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. + 0xfd extra: rel_call Counts near relative CALL branch instructions retired. +@@ -87,24 +85,24 @@ name:br_inst_retired type:exclusive default:all_branches + 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. + 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. + 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. +- 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). +- 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. + name:br_misp_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. 
+ 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. +- 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). +- 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). + 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. + 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. +- 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. 
+- 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. +- 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. +- 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. + 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. + 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. +name:issue_slots_not_consumed type:exclusive default:0x0 + 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) +- 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend.
Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. Including but not limited to resources such as the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. + 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. + name:hw_interrupts type:exclusive default:0x1 + 0x1 extra: received Counts hardware interrupts received by the processor. +@@ -117,8 +115,8 @@ name:mem_uops_retired type:exclusive default:all + 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. + 0x81 extra: all_loads Counts the number of load uops retired + 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired +- 0x82 extra: all_stores Counts the number of store uops retired +- 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired. ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired. 
+ 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. + 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. + 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. +@@ -128,28 +126,28 @@ name:mem_uops_retired type:exclusive default:all + 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. + 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. + 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. +- 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundry. 
++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundary. + name:mem_load_uops_retired type:exclusive default:l1_hit +- 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache +- 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache +- 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache +- 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache +- 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache +- 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache +- 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache +- 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache. ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache. ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache. ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache. ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache. ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache. ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache. 
++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache. + 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. + 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. +- 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. 
Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. +- 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. 
A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. 
A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. + name:baclears type:exclusive default:0x1 + 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. + 0x8 extra: return Counts BACLEARS on return instructions. +- 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Condition is Met) branches. +-- +2.7.4 + +From b3c20ae8b52c10aa631ca0b931388df98ca3183d Mon Sep 17 00:00:00 2001 +From: Michael Petlan +Date: Fri, 23 Sep 2016 13:35:54 +0200 +Subject: [PATCH 17/18] Intel Goldmont default event + +Hi all, + +when testing oprofile on an Intel Goldmont machine, I have found out +that the default event cpu_clk_unhalted returns always zero. Thus, I +checked the configuration and Intel SDM, and I think there must be a +mistake. + +According to the Intel SDM, table 19-24, the event is 0x3c as usual. +It has two unit masks (0x00 (core_p) and 0x01 (ref)). With this, the +event starts giving reasonable results. + +The current configuration which is coded in oprofile is not even in +the SDM table 19-24, so it is expected that the following will give +zero value: + +perf stat -e cpu/event=0x00,umask=0x02/ ls + +Please consider applying the attached patch. + +CC'ing Andi to verify the fix.
+ +Thank you, +Michael + +commit df73e385442236fd6e763cc192185c606e59feda +Author: Michael Petlan +Date: Fri Sep 23 13:16:00 2016 +0200 + + Fixed default event on Intel Goldmont + + According to the Intel SDM, table 19-24, the event cpu_clk_unhalted + has the event number 0x3c and has two unit masks (0x00, 0x01). This + also corresponds to other Intels where the event is also 0x3c. + + Tested on a Goldmont Harrisonville (model 95). + + Before the patch: + + $ ocount ls + Events were actively counted for 1761229 nanoseconds. + Event counts (actual) for /usr/bin/ls: + Event Count % time counted + cpu_clk_unhalted 0 100.00 + + After the patch: + + Event counts (actual) for /usr/bin/ls: + Event Count % time counted + cpu_clk_unhalted 2,948,142 100.00 + + Signed-off-by: Michael Petlan +--- + events/i386/goldmont/events | 2 +- + events/i386/goldmont/unit_masks | 4 +--- + 2 files changed, 2 insertions(+), 4 deletions(-) + +diff --git a/events/i386/goldmont/events b/events/i386/goldmont/events +index 111438e..89cbc59 100644 +--- a/events/i386/goldmont/events ++++ b/events/i386/goldmont/events +@@ -6,7 +6,7 @@ + # Note the minimum counts are not discovered experimentally and could be likely + # lowered in many cases without ill effect. 
+ # +-event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x3c counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : + event:0x03 counters:cpuid um:ld_blocks minimum:200003 name:ld_blocks : + event:0x05 counters:cpuid um:page_walks minimum:200003 name:page_walks : + event:0x0e counters:cpuid um:uops_issued minimum:200003 name:uops_issued_any : +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +index d1c08d4..9d93da0 100644 +--- a/events/i386/goldmont/unit_masks ++++ b/events/i386/goldmont/unit_masks +@@ -21,9 +21,7 @@ name:uops_issued type:mandatory default:0x0 + 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. + name:uops_not_delivered type:mandatory default:0x0 + 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by back end. The back-end then takes these uops, allocates the required resources. 
When all resources are ready, uops are executed. If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when back-end is requesting more uops and front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end efficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue. +-name:cpu_clk_unhalted type:exclusive default:core +- 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. 
+- 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event ++name:cpu_clk_unhalted type:exclusive default:core_p + 0x0 extra: core_p Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. + 0x1 extra: ref Reference cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. + name:ld_blocks type:exclusive default:all_block +-- +2.7.4 + diff --git a/SOURCES/oprofile-kabylake.patch b/SOURCES/oprofile-kabylake.patch new file mode 100644 index 0000000..f53d4e8 --- /dev/null +++ b/SOURCES/oprofile-kabylake.patch @@ -0,0 +1,28 @@ +From 402cad1b6f5605ed854eb8b7b7376cafce3fb007 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 29 Apr 2016 17:50:25 -0700 +Subject: [PATCH 11/18] oprofile: Add model numbers for Kabylake CPUs + +The PMU is using the same events as Skylake, so no other changes. 
+ +Signed-off-by: Andi Kleen +--- + libop/op_hw_specific.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index f4db8f5..2061760 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -157,6 +157,8 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4e: + case 0x5e: + case 0x55: ++ case 0x8e: ++ case 0x9e: + return CPU_SKYLAKE; + case 0x37: + case 0x4d: +-- +2.7.4 + diff --git a/SOURCES/oprofile-oparchive.patch b/SOURCES/oprofile-oparchive.patch new file mode 100644 index 0000000..42dae98 --- /dev/null +++ b/SOURCES/oprofile-oparchive.patch @@ -0,0 +1,119 @@ +From 3a5515f85ea2e007343c225e78cba66dde133327 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Wed, 6 Jul 2016 10:53:51 -0400 +Subject: [PATCH 14/18] Store samples in the archive and search the appropriate + places for samples + +Newer versions of oprofile use an oprofile_data directory in the +current working directory to store the samples. This presents +complications when data is archived with oparchive. The oparchive needs +to include samples in the archive. The code also needs to make sure +that samples in the archive are used and not samples from an +oprofile_data directory in the current working directory.
+ +Signed-off-by: William Cohen +--- + libpp/profile_spec.cpp | 2 ++ + pp/oparchive.cpp | 39 ++++++++++++++++++++++++++++----------- + pp/oparchive_options.cpp | 5 ++--- + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/libpp/profile_spec.cpp b/libpp/profile_spec.cpp +index cd4bd80..a10d6cc 100644 +--- a/libpp/profile_spec.cpp ++++ b/libpp/profile_spec.cpp +@@ -102,6 +102,8 @@ void profile_spec::set_image_or_lib_name(string const & str) + void profile_spec::parse_archive_path(string const & str) + { + archive_path = op_realpath(str); ++ /* Need to force session directory default location in the archive */ ++ init_op_config_dirs(OP_SESSION_DIR_DEFAULT); + } + + +diff --git a/pp/oparchive.cpp b/pp/oparchive.cpp +index 5b6906d..6221e14 100644 +--- a/pp/oparchive.cpp ++++ b/pp/oparchive.cpp +@@ -232,6 +232,19 @@ int oparchive(options::spec const & spec) + } + } + ++ /* place samples and other related material in easily found default directory */ ++ string dest_session_dir = options::outdirectory + string(OP_SESSION_DIR_DEFAULT); ++ string dest_samples_dir = dest_session_dir + string("samples"); ++ ++ /* dest_session_dir is the parent of dest_samples_dir and will also be created */ ++ ++ if (!options::list_files && ++ create_path(dest_samples_dir.c_str())) { ++ cerr << "Unable to create directory for " ++ << dest_samples_dir << "."
<< endl; ++ exit (EXIT_FAILURE); ++ } ++ + /* copy over each of the sample files */ + list::iterator sit = sample_files.begin(); + list::iterator const send = sample_files.end(); +@@ -245,9 +258,13 @@ int oparchive(options::spec const & spec) + + for (; sit != send; ++sit) { + string sample_name = *sit; ++ /* determine the session name of sample file */ ++ int offset = sample_name.find('{'); ++ string base_samples_dir = sample_name.substr(0, offset-1); ++ string session = basename(base_samples_dir.c_str()); + /* Get rid of the the archive_path from the name */ +- string sample_base = sample_name.substr(archive_path.size()); +- string sample_archive_file = options::outdirectory + sample_base; ++ string sample_base = sample_name.substr(offset); ++ string sample_archive_file = dest_samples_dir + "/" + session + "/" + sample_base; + + cverb << vdebug << sample_name << endl; + cverb << vdebug << " destp " << sample_archive_file << endl; +@@ -268,19 +285,19 @@ int oparchive(options::spec const & spec) + cerr << "Unable to to obtain realpath for " << op_session_dir << endl; + exit (EXIT_FAILURE); + } +- string abi_name = string(real_session_dir) + "/abi"; +- copy_one_file(image_ok, archive_path + abi_name, +- options::outdirectory + abi_name); ++ string abi_name = string(real_session_dir) + string("/abi"); ++ string dest_abi_name = dest_session_dir + string("/abi"); ++ copy_one_file(image_ok, archive_path + abi_name, dest_abi_name); + + /* copy over the /samples/oprofiled.log file */ +- string log_name = string(real_session_dir) + string("/samples") + "/oprofiled.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ string log_name = string(real_session_dir) + string("/samples") + string("/oprofiled.log"); ++ string dest_log_name = dest_samples_dir + string("/oprofiled.log"); ++ copy_one_file(image_ok, archive_path + log_name, dest_log_name); + + /* copy over the /samples/operf.log file */ +- log_name = string(real_session_dir) 
+ string("/samples") + "/operf.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ log_name = string(real_session_dir) + string("/samples") + string("/operf.log"); ++ dest_log_name = dest_samples_dir + string("/operf.log"); ++ copy_one_file(image_ok, archive_path + log_name, dest_log_name); + + free(real_session_dir); + +diff --git a/pp/oparchive_options.cpp b/pp/oparchive_options.cpp +index e6f2ddc..b79bf13 100644 +--- a/pp/oparchive_options.cpp ++++ b/pp/oparchive_options.cpp +@@ -124,7 +124,6 @@ void handle_options(options::spec const & spec) + + if (strncmp(op_session_dir, OP_SESSION_DIR_DEFAULT, strlen(OP_SESSION_DIR_DEFAULT))) + cerr << "NOTE: The sample data in this archive is located at " << op_session_dir << endl +- << "instead of the standard location of " << OP_SESSION_DIR_DEFAULT << ". Hence, when using opreport" << endl +- << "and other post-processing tools on this archive, you must pass the following option:" << endl +- << "\t--session-dir=" << op_session_dir << endl; ++ << "and is being moved to the standard location of " << OP_SESSION_DIR_DEFAULT << "." ++ << endl; + } +-- +2.7.4 + diff --git a/SOURCES/oprofile-power.patch b/SOURCES/oprofile-power.patch new file mode 100644 index 0000000..2f3a78e --- /dev/null +++ b/SOURCES/oprofile-power.patch @@ -0,0 +1,793 @@ +From 34715734fd6f4b44f32206541c8a2500514c9922 Mon Sep 17 00:00:00 2001 +From: "Carl E. Love" +Date: Fri, 13 Nov 2015 12:27:56 -0800 +Subject: [PATCH 02/18] Remove Powerpc OProfile events the kernel will reject + +This patch comments out a number of events with "## note 1" to indicate +that the events are not supported. A "note 1" is added to the beginning +of the file explaining the issue. The issue is the events require the +setting of a register by the kernel when setting up to measure the +events that is only writable by the hypervisor. 
Currently, there is no +API allowing the kernel to request the required field of the register be +changed by the hypervisor. The events can be re-enabled if an API is +created for the OS to request the hypervisor set the bits. + +Signed-off-by: Carl Love +--- + events/ppc64/power8/events | 312 +++++++++++++++++++++++---------------------- + 1 file changed, 163 insertions(+), 149 deletions(-) + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 012ca89..851299d 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -7,6 +7,20 @@ + + include:ppc64/architected_events_v1 + ++#note 1. 11/12/2015 ++# ++# These events require the cache selector bits to be set to a non-zero ++# value in the processor performance counter setup register. On Power 8, this ++# register is only writable by the hypervisor. So the kernel must reject any ++# event where the lower three cache selector bits (bits 22:20) are not equal ++# to 0. If/when an API is implemented to allow the kernel to request the ++# hypervisor write the register with the required value, these events can be ++# re-added to the list of supported events. The issue is documented in the ++# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() ++# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If ++# cache bits are not zero, the function returns -1 to reject the event. ++ ++ + event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. + event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. + event:0x4e05e counters:3 um:zero minimum:100000 name:PM_4LPAR_CYC : Number of cycles in 4 LPAR mode. +@@ -89,49 +103,49 @@ event:0x4d012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR_LONG : + event:0x2d012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VSU : Completion stall due to VSU instruction.
+ event:0x16083 counters:0 um:zero minimum:10000 name:PM_CO0_ALLOC : 0.0 + event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) +-event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy +-event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) ++## note 1 event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy ++## note 1 event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) + event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. 
+-event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load +-event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's 
L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++## note 1 event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if 
MMCR1[16] is 1 ++## note 1 event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 + event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data cache 
was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c042 counters:2 
um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to 
either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load +-event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro +-event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load +-event:0x61c054 
counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load +-event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load +-event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load +-event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or +-event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++## note 1 event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to 
either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c048 counters:1 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x63c04a 
counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++## note 1 event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++## note 1 event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++## note 1 event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++## note 1 event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load ++## note 1 event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++## note 1 event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++## note 1 event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++## note 1 event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load + event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load. + event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. + event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
+@@ -430,11 +444,11 @@ event:0x25046 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_MOD : A + event:0x1504a counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. + event:0x2504a counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a instruction side request. + event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a instruction side request. +-event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts +-event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine +-event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision ++## note 1 event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts ++## note 1 event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine ++## note 1 event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision + event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) +-event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch ++## note 1 event:0x44608e counters:3 um:zero minimum:10000 
name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch + event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. + event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject + event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject +@@ -451,107 +465,107 @@ event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 IS + event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject + event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU + event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread +-event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc +-event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc +-event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++## note1 event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++## note1 event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++## note1 event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++## note1 event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc + event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) + event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . 
+ event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 + event:0x40012 counters:3 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_ALL : Counts all Icache reloads includes demand, prefetchm prefetch turned into demand and demand turned into prefetch. + event:0x30068 counters:2 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_PREF : Counts all Icache prefetch reloads ( includes demand turned into prefetch). +-event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) +-event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) ++## note 1 event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) ++## note 1 event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) + event:0x27084 counters:1 um:zero minimum:10000 name:PM_L2_CHIP_PUMP : RC requests that were local on chip pump attempts +-event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 +-event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. 
+-event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts +-event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) +-event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) +-event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 +-event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) +-event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) +-event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread +-event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful load dispatches +-event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits +-event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread +-event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc and guess was correct (ie data local) +-event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) +-event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt +-event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ +-event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons 
+-event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt +-event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ +-event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons +-event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx +-event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core ++## note 1 event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 ++## note 1 event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. ++## note 1 event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts ++## note 1 event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) ++## note 1 event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) ++## note 1 event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 ++## note 1 event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++## note 1 event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) ++## note 1 event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread ++## note 1 event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All 
successful load dispatches ++## note 1 event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits ++## note 1 event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread ++## note 1 event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc and guess was correct (ie data local) ++## note 1 event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) ++## note 1 event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt ++## note 1 event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ ++## note 1 event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons ++## note 1 event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt ++## note 1 event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ ++## note 1 event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons ++## note 1 event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx ++## note 1 event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core + event:0x3708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core +-event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M +-event:0x54708e counters:3 um:zero minimum:10000 
name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M +-event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix ++## note 1 event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M ++## note 1 event:0x54708e counters:3 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M ++## note 1 event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix + event:0x17080 counters:0 um:zero minimum:10000 name:PM_L2_ST : All successful D-side store dispatches for this thread +-event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches +-event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits ++## note 1 event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches ++## note 1 event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits + event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successful D-side store dispatches for this thread that were L2 Miss +-event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) +-event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) ++## note 1 event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) ++## note 1 event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) + event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts + event:0x1e05e 
counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. + event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. +-event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject +-event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count +-event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++## note1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++## note1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++## note1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count + event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives +-event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) ++## note 1 event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) + event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 + event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid + event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) +-event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++## note1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO + event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) + event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough) +-event:0xb19082 counters:0 um:zero minimum:10000 
name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) +-event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high +-event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). Prediction too Low +-event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits +-event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits +-event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss +-event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit +-event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss +-event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits +-event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss ++## note 1 event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) ++## note 1 event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Predition too high ++## note 1 event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). 
Prediction too Low ++## note 1 event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits ++## note 1 event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits ++## note 1 event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss ++## note 1 event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit ++## note 1 event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss ++## note 1 event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits ++## note 1 event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss + event:0x1e052 counters:0 um:zero minimum:10000 name:PM_L3_LD_PREF : L3 Load Prefetches. +-event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) +-event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low +-event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses +-event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 +-event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 +-event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 +-event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 +-event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 +-event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 +-event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 +-event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 +-event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 +-event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 +-event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx +-event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 +-event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 +-event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 +-event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 +-event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 +-event:0xa29086 counters:1 um:zero minimum:10000 name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 +-event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 +-event:0x518082 counters:0 um:zero 
minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 +-event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 +-event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 +-event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 +-event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 +-event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx +-event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 +-event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 ++## note 1 event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) ++## note 1 event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low ++## note 1 event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses ++## note 1 event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 ++## note 1 event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 ++## note 1 event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 ++## note 1 event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 ++## note 1 event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 ++## note 1 event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 ++## note 1 event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 ++## note 1 event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 ++## note 1 event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 ++## note 1 event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 ++## note 1 event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx ++## note 1 event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 ++## note 1 event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 ++## note 1 event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 ++## note 1 event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 ++## note 1 event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 ++## note 1 event:0xa29086 counters:1 um:zero minimum:10000 
name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 ++## note 1 event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 ++## note 1 event:0x518082 counters:0 um:zero minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 ++## note 1 event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 ++## note 1 event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 ++## note 1 event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 ++## note 1 event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 ++## note 1 event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx ++## note 1 event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 ++## note 1 event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 + event:0x84908d counters:3 um:zero minimum:10000 name:PM_L3_PF0_ALLOC : 0.0 + event:0x84908c counters:3 um:zero minimum:10000 name:PM_L3_PF0_BUSY : lifetime, sample of PF machine 0 valid +-event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 ++## note 1 event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 + event:0x18080 counters:0 um:zero minimum:10000 name:PM_L3_PF_MISS_L3 : L3 Prefetch missed in L3 + event:0x3808a counters:2 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_CACHE : L3 Prefetch from Off chip cache + event:0x4808e counters:3 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_MEM : L3 Prefetch from Off chip memory +@@ -567,12 +581,12 @@ event:0x839088 counters:2 um:zero minimum:10000 name:PM_L3_SN0_BUSY : lifetime, + event:0x819080 counters:0 um:zero minimum:10000 name:PM_L3_SN_USAGE : rotating sample of 8 
snoop valids + event:0x2e052 counters:1 um:zero minimum:10000 name:PM_L3_ST_PREF : L3 store Prefetches. + event:0x3e052 counters:2 um:zero minimum:10000 name:PM_L3_SW_PREF : Data stream touchto L3. +-event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) +-event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Predction too high +-event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch ++## note 1 event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) ++## note 1 event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Predction too high ++## note 1 event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch + event:0x18081 counters:0 um:zero minimum:10000 name:PM_L3_WI0_ALLOC : 0.0 +-event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid +-event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives ++## note 1 event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid ++## note 1 event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives + event:0x3c058 counters:2 um:zero minimum:10000 name:PM_LARX_FIN : Larx finished . + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. 
+@@ -853,7 +867,7 @@ event:0x3d15e counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked in + event:0x20b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NESTED_TEND : Completion time nested tend + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. + event:0x20b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_NON_FAV_TBEGIN : Dispatch time non favored tbegin +-event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc ++## note 1 event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc + event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Ccycles after all instructions have finished to group completed. + event:0x20ac counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TBEGIN : Completion time outer tbegin + event:0x20ae counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TEND : Completion time outer tend +@@ -879,14 +893,14 @@ event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump predict + event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). + event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 + event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) +-event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc +-event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 +-event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 +-event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++## note 1 event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++## note 1 event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++## note 1 event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++## note 1 event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc + event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running +-event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc +-event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc +-event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine ++## note 1 event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc ++## note 1 event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc ++## note 1 event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine + event:0x20004 counters:1 um:zero minimum:10000 name:PM_REAL_SRQ_FULL : Out of real srq entries. + event:0x3006c counters:2 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_MODE : Cycles run latch is set and core is in SMT2 mode. + event:0x2006a counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_SHRD_MODE : Cycles run latch is set and core is in SMT2-shared mode. +@@ -902,13 +916,13 @@ event:0x5090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DISABLE : Sto + event:0x26085 counters:1 um:zero minimum:10000 name:PM_SN0_ALLOC : 0.0 + event:0x26084 counters:1 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) + event:0xd0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_SNOOP_TLBIE : TLBIE snoopSnoop TLBIE +-event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu +-event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te ++## note 1 event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu ++## note 1 event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te + event:0x4608c counters:3 um:zero minimum:10000 name:PM_SN_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 SN machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x10028 counters:0 um:zero minimum:10000 name:PM_STALL_END_GCT_EMPTY : Count ended because GCT went empty. + event:0x1e058 counters:0 um:zero minimum:10000 name:PM_STCX_FAIL : stcx failed . + event:0xc090 counters:0,1,2,3 um:zero minimum:10000 name:PM_STCX_LSU : STCX executed reported at sent to nest42 +-event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail ++## note 1 event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail + event:0x20016 counters:1 um:zero minimum:10000 name:PM_ST_CMPL : Store completion count. + event:0x20018 counters:1 um:zero minimum:10000 name:PM_ST_FWD : Store forwards that finished. + event:0x0 counters:0,1,2,3 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF. +@@ -941,8 +955,8 @@ event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshol + event:0x30058 counters:2 um:zero minimum:10000 name:PM_TLBIE_FIN : tlbie finished. + event:0x20066 counters:1 um:zero minimum:10000 name:PM_TLB_MISS : TLB Miss (I + D). 
+ event:0x20b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_BEGIN_ALL : Tm any tbegin +-event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC +-event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capactiy Overflow ++## note 1 event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC ++## note 1 event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capactiy Overflow + event:0x20ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_END_ALL : Tm any tend + event:0x3086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_NON_TM : TEXAS fail reason @ completion + event:0x3088 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CON_TM : TEXAS fail reason @ completion +@@ -952,13 +966,13 @@ event:0xe0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_NON_TX_CONFL + event:0x308a counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_SELF : TEXAS fail reason @ completion + event:0xe0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TLBIE : TLBIE hit bloom filter42 + event:0xe0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TX_CONFLICT : Transactional conflict from LSU, whatever gets reported to texas 42 +-event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail +-event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail +-event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) +-event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc +-event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line +-event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to 
fail +-event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) ++## note 1 event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail ++## note 1 event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail ++## note 1 event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) ++## note 1 event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc ++## note 1 event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line ++## note 1 event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to fail ++## note 1 event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) + event:0x20bc counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TBEGIN : Tm nested tbegin + event:0x10060 counters:0 um:zero minimum:10000 name:PM_TM_TRANS_RUN_CYC : run cycles in transactional state. + event:0x30060 counters:2 um:zero minimum:10000 name:PM_TM_TRANS_RUN_INST : Instructions completed in transactional state. +-- +2.7.4 + +From 54bd5569033f7ec395e47efc5264d95e48907475 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Thu, 19 Nov 2015 16:29:22 -0500 +Subject: [PATCH 03/18] Remove unused Power 8 unit masks + +To prevent people from using PMU events that the kernel would reject +on Power 8 commit 34715734fd6f commented out those events. However, +additional checks in oprofile code would note that some of the unit +masks were unused due to those commented out events and prevent +oprofile tools from running. The unused unit masks have been +commented out to pass these checks. 
+ +Signed-off-by: William Cohen +--- + events/ppc64/power8/unit_masks | 25 +++++++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +diff --git a/events/ppc64/power8/unit_masks b/events/ppc64/power8/unit_masks +index 203af97..96b32c0 100644 +--- a/events/ppc64/power8/unit_masks ++++ b/events/ppc64/power8/unit_masks +@@ -9,9 +9,26 @@ + # to workaround oprofile's 32-bit limitation for event codes. + # See libpe_utils/op_pe_utils.cpp:_get_event_code for how these codes are + # used. ++# ++#note 1. 11/12/2015 ++# ++# Some event requires the cache selector bits to be set to a non-zero ++# value in the processor performance counter setup register. On Power 8, this ++# register is only writable by the hypervisor. So the kernel must reject any ++# event where the lower three cache selector bits (bits 22:20) are not equal ++# to 0. If/when an API is implemented to allow the kernel to request the ++# hypervisor write the register with the required value, these events can be ++# re-added to the list of supported events. The issue is documented in the ++# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() ++# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If ++# cache bits are not zero, the function returns -1 to reject the event. ++# ++# The associated unit masks for these problem events are unused and also need ++# to be commented out. 
++# + name:zero type:mandatory default:0x0 + 0x0 No unit mask +-name:rc_machine type:mandatory default:0xde +- 0xde Thresholdable start/stop for rc machine for sampled instruction +-name:L1_latency type:mandatory default:0x67 +- 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload ++## note 1 name:rc_machine type:mandatory default:0xde ++## note 1 0xde Thresholdable start/stop for rc machine for sampled instruction ++## note 1 name:L1_latency type:mandatory default:0x67 ++## note 1 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload +-- +2.7.4 + +From cfecfbfa3e5c76ab544f64946af38a7f2efec9a3 Mon Sep 17 00:00:00 2001 +From: "Carl E. Love" +Date: Mon, 14 Dec 2015 14:18:35 -0800 +Subject: [PATCH 05/18] Remove Powerpc OProfile events the kernel will reject + +Will, Rei: + +As I suspected in the last email, there is a second issue that has to do +with what version of the libpfm library OProfile is using. Initially a +subset of the OProfile events for Power 8 were added to libpfm4.5. +Later the complete set of events was added to libpfm4.6. So, my first +attempt at removing the events that perf was rejecting inadvertently +included events that were in libpfm 4.6 but not libpfm 4.5. My version +of oprofile used a patched version of libpfm 4.4 that effectively +made it libpfm 4.5. I redid the patch and verified that when Oprofile is +built with libpfm 4.6 there are no rejected events. I added a comment +in the event file as a heads up to this effect. + +Please take a look at the patch and see if it works OK for you. Sorry +for the delay in getting this out. + + Carl Love + +---------------------------- + +Re-enable Power 8 events that the kernel does not reject. + +The previous patch to remove Power 8 events that were being rejected by +the kernel also removed events that were actually being rejected by +OProfile. 
OProfile was rejecting the events on the test machine because +the test machine used a version of libpfm that did not have all of the +Power 8 events. This patch re-enables the Power 8 events that are +not rejected by the kernel. + +Libpfm 4.5 only contains a subset of all the available Power 8 events. +The complete list of Power 8 events is supported by libpfm 4.6. To use +all of the events in this file, OProfile must be compiled with +the libpfm 4.6 library or newer. Otherwise, OProfile will reject the +event if it is not in the libpfm 4.5 library. + +Signed-off-by: Carl Love +--- + events/ppc64/power8/events | 111 +++++++++++++++++++++-------------------- + events/ppc64/power8/unit_masks | 24 ++------- + 2 files changed, 62 insertions(+), 73 deletions(-) + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 851299d..9a3c74e 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -1,4 +1,4 @@ +-# ++ + # Copyright OProfile authors + # Copyright (c) International Business Machines, 2013. + # Contributed by Maynard Johnson . +@@ -7,7 +7,7 @@ + + include:ppc64/architected_events_v1 + +-#note 1. 11/12/2015 ++# note 1. 11/12/2015 + # + # These event requires the cache selector bits to be set to a non-zero + # value in the processor performance counter setup register. On Power 8, this +@@ -19,7 +19,12 @@ include:ppc64/architected_events_v1 + # powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() + # where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If + # cache bits are not zero, the function returns -1 to reject the event. +- ++# ++# note 2. ++# ++# To use all of the events listed in this file, you must have OProfile ++# complied with the libpfm 4.6 or newer library. Libpfm 4.5 supports a ++# subset of these events. + + event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. 
+ event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. +@@ -107,45 +112,45 @@ event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy + ## note 1 event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) + event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. +-## note 1 event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load +-## note 1 event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's 
memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++event:0x64c048 counters:3 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other 
than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 + event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c044 
counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due 
to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded 
from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 +-## note 1 event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load +-## note 1 event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro +-## note 1 event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load +-## note 1 event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load +-## note 1 event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load +-## note 1 event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load +-## note 1 event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or +-## note 1 event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c044 
counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c048 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if 
MMCR1[16] is 1 ++event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04a counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if 
MMCR1[16] is 1 ++event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load ++event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load + event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a 
demand load. + event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. + event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. +@@ -465,10 +470,10 @@ event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 IS + event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject + event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU + event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread +-## note1 event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc +-## note1 event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc +-## note1 event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc +-## note1 event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++event:0x200101e8 counters:0 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++event:0x200201e6 counters:1 um:L1_latency minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc + 
event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) + event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . + event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors wriittent into IL1 +@@ -512,15 +517,15 @@ event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successf + event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts + event:0x1e05e counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. + event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. +-## note1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject +-## note1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count +-## note1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++## note 1 event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++## note 1 event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++## note 1 event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count + event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives + ## note 1 event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occuring ( does not include casthrough or log writes (cinj/dmaw) + event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 + event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid + event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 
and 1 ( lossy) +-## note1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++## note 1 event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO + event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) + event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough) + ## note 1 event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial scope=group and data from same group (near) (pred successful) +@@ -893,10 +898,10 @@ event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump predict + event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). + event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 + event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) +-## note 1 event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc +-## note 1 event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 +-## note 1 event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 +-## note 1 event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++event:0x200301ea counters:2 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x200101e8 counters:0 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++event:0x200201e6 counters:1 um:rc_machine minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc + event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running + ## note 1 event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc + ## note 1 event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc +diff --git a/events/ppc64/power8/unit_masks b/events/ppc64/power8/unit_masks +index 96b32c0..391f363 100644 +--- a/events/ppc64/power8/unit_masks ++++ b/events/ppc64/power8/unit_masks +@@ -10,25 +10,9 @@ + # See libpe_utils/op_pe_utils.cpp:_get_event_code for how these codes are + # used. + # +-#note 1. 
11/12/2015 +-# +-# Some event requires the cache selector bits to be set to a non-zero +-# value in the processor performance counter setup register. On Power 8, this +-# register is only writable by the hypervisor. So the kernel must reject any +-# event where the lower three cache selector bits (bits 22:20) are not equal +-# to 0. If/when an API is implemented to allow the kernel to request the +-# hypervisor write the register with the required value, these events can be +-# re-added to the list of supported events. The issue is documented in the +-# powerpc kernel file arch/powerpc/perf/power8-pmu.c in function power8_get_constraint() +-# where the cache bits are ANDed with 0x7 if the unit is between 6 and 9. If +-# cache bits are not zero, the function returns -1 to reject the event. +-# +-# The associated unit masks for these problem events are unused and also need +-# to be commented out. +-# + name:zero type:mandatory default:0x0 + 0x0 No unit mask +-## note 1 name:rc_machine type:mandatory default:0xde +-## note 1 0xde Thresholdable start/stop for rc machine for sampled instruction +-## note 1 name:L1_latency type:mandatory default:0x67 +-## note 1 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload ++name:rc_machine type:mandatory default:0xde ++ 0xde Thresholdable start/stop for rc machine for sampled instruction ++name:L1_latency type:mandatory default:0x67 ++ 0x67 Thresholdable start/stop for L1 sampled instruction load miss/reload +-- +2.7.4 + +From 6fcd5aa57482a58fcb0166982fed517fbf7040fb Mon Sep 17 00:00:00 2001 +From: "Carl E. Love" +Date: Thu, 17 Mar 2016 13:49:41 -0700 +Subject: [PATCH 07/18] POWER 8 processor event spelling fixes + +Will: + +Here is a patch to fix the spelling errors in the Power 8 events. See +OProfile bugzilla number 281. This patch corrects the spelling errors. + + Carl Love +------------------------------------------------------------ + +POWER 8 processor event spelling fixes. 
+ +Fixed the spelling of six of the events. + +Signed-off-by: Carl E. Love +--- + events/ppc64/power8/events | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 9a3c74e..b7f7ee2 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -455,13 +455,13 @@ event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page + event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) + ## note 1 event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch + event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. +-event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject +-event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject ++event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_FX0 : FX0 ISU reject ++event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_FX1 : FX1 ISU reject + event:0x38ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FXU : ISU +-event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS0 : LS0 ISU reject +-event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS1 : LS1 ISU reject +-event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS2 : LS2 ISU reject +-event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS3 : LS3 ISU reject ++event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS0 : LS0 ISU reject ++event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS1 : LS1 ISU reject ++event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS2 : LS2 ISU reject ++event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_LS3 : LS3 ISU reject + event:0x309c 
counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECTS_ALL : All isu rejects could be more than 1 per cycle + event:0x30a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_RES_NA : ISU reject due to resource not available + event:0x309e counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SAR_BYPASS : Reject because of SAR bypass +-- +2.7.4 + diff --git a/SOURCES/oprofile-skylake.patch b/SOURCES/oprofile-skylake.patch new file mode 100644 index 0000000..014a4cb --- /dev/null +++ b/SOURCES/oprofile-skylake.patch @@ -0,0 +1,27 @@ +From 635d1f59ff198a43deb9482cdec10795222e506a Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 15 Apr 2016 13:14:51 -0700 +Subject: [PATCH 08/18] Add model number of Skylake server to oprofile + +Just reuse the event list of Skylake client. + +Signed-off-by: Andi Kleen +--- + libop/op_hw_specific.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 994fec4..a6180f4 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -156,6 +156,7 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + return CPU_BROADWELL; + case 0x4e: + case 0x5e: ++ case 0x55: + return CPU_SKYLAKE; + case 0x37: + case 0x4d: +-- +2.7.4 + diff --git a/SOURCES/oprofile-startup.patch b/SOURCES/oprofile-startup.patch new file mode 100644 index 0000000..5beb74c --- /dev/null +++ b/SOURCES/oprofile-startup.patch @@ -0,0 +1,59 @@ +From 249fe0a4bb69e5bd2e9ee0a0667d925a86d4337c Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Tue, 9 Aug 2016 22:25:52 -0400 +Subject: [PATCH 16/18] Only start the application if the perf events setup was + successful + +The code was starting the application before the performance events +were set up. In some cases the setup of the perf events may fail +and the code needs to verify that the performance events have been +successfully set up before starting the application.
Changed the +order of those steps to allow a check of the perf event setup before +launching the application. + +Signed-off-by: William Cohen +--- + pe_counting/ocount.cpp | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 4d9c104..7717717 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -257,16 +257,6 @@ bool start_counting(void) + proc_list = ocount_options::processes; + } + +- if (startApp) { +- // Tell app_PID to start the app +- cverb << vdebug << "telling child to start app" << endl; +- if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) { +- perror("Internal error on start_app_pipe"); +- return false; +- } +- app_started = true; +- } +- + orecord = new ocount_record(runmode, events, ocount_options::display_interval ? true : false); + bool ret; + switch (runmode) { +@@ -300,6 +290,16 @@ bool start_counting(void) + ret = false; + } + ++ if (startApp && ret != false) { ++ // Tell app_PID to start the app ++ cverb << vdebug << "telling child to start app" << endl; ++ if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) { ++ perror("Internal error on start_app_pipe"); ++ return false; ++ } ++ app_started = true; ++ } ++ + return ret; + } + +-- +2.7.4 + diff --git a/SOURCES/oprofile-xgene.patch b/SOURCES/oprofile-xgene.patch new file mode 100644 index 0000000..55df6c9 --- /dev/null +++ b/SOURCES/oprofile-xgene.patch @@ -0,0 +1,150 @@ +From 794258aba6c09e3de6d59e3d5977c543064b8c97 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Fri, 6 May 2016 15:40:10 -0400 +Subject: [PATCH 13/18] Additional X-Gene 1 performance events + +The initial OProfile X-Gene 1 support only had the ARMv8 generic +performance events. 
There are many additional microarchitecture +performance events listed for X-Gene 1 at: + +https://github.com/AppliedMicro/ENGLinuxLatest/blob/apm_linux_v3.17-rc4/Documentation/arm64/xgene_pmu.txt + +This patch adds those X-Gene 1 specific events. + +v2: Updated to exclude armv3 architected events not supported by X-Gene +Signed-off-by: William Cohen +--- + events/arm/armv8-xgene/events | 119 +++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 117 insertions(+), 2 deletions(-) + +diff --git a/events/arm/armv8-xgene/events b/events/arm/armv8-xgene/events +index 3e28463..1df573a 100644 +--- a/events/arm/armv8-xgene/events ++++ b/events/arm/armv8-xgene/events +@@ -2,6 +2,121 @@ + # Copyright (c) Red Hat, 2014. + # Contributed by William Cohen + # +-# Basic ARM V8 events ++# Applied Micro X-Gene events + # +-include:arm/armv8-pmuv3-common ++# The X-Gene processor excludes a few of the basic ARMv8 architected events. ++# Thus, need to explicitly list them rather than include ++# arm/armv8-pmuv3-common ++ ++# The basic ARMv8 architect events supported by X-Gene ++event:0x00 um:zero minimum:500 name:SW_INCR : Instruction architecturally executed, condition code check pass, software increment ++event:0x01 um:zero minimum:5000 name:L1I_CACHE_REFILL : Level 1 instruction cache refill ++event:0x02 um:zero minimum:5000 name:L1I_TLB_REFILL : Level 1 instruction TLB refill ++event:0x03 um:zero minimum:5000 name:L1D_CACHE_REFILL : Level 1 data cache refill ++event:0x04 um:zero minimum:5000 name:L1D_CACHE : Level 1 data cache access ++event:0x05 um:zero minimum:5000 name:L1D_TLB_REFILL : Level 1 data TLB refill ++# event:0x06 um:zero minimum:100000 name:LD_RETIRED : Instruction architecturally executed, condition code check pass, load ++# event:0x07 um:zero minimum:100000 name:ST_RETIRED : Instruction architecturally executed, condition code check pass, store ++event:0x08 um:zero minimum:100000 name:INST_RETIRED : Instruction architecturally executed ++event:0x09 um:zero 
minimum:500 name:EXC_TAKEN : Exception taken ++event:0x0A um:zero minimum:500 name:EXC_RETURN : Instruction architecturally executed, condition code check pass, exception return ++event:0x0B um:zero minimum:500 name:CID_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to CONTEXTIDR ++# event:0x0C um:zero minimum:5000 name:PC_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, software change of the PC ++# event:0x0D um:zero minimum:5000 name:BR_IMMED_RETIRED : Instruction architecturally executed, immediate branch ++# event:0x0E um:zero minimum:5000 name:BR_RETURN_RETIRED : Instruction architecturally executed, condition code check pass, procedure return ++# event:0x0F um:zero minimum:500 name:UNALIGNED_LDST_RETIRED : Instruction architecturally executed, condition code check pass, unaligned load or store ++event:0x10 um:zero minimum:5000 name:BR_MIS_PRED : Mispredicted or not predicted branch speculatively executed ++event:0x11 um:zero minimum:100000 name:CPU_CYCLES : Cycle ++event:0x12 um:zero minimum:5000 name:BR_PRED : Predictable branch speculatively executed ++event:0x13 um:zero minimum:100000 name:MEM_ACCESS : Data memory access ++event:0x14 um:zero minimum:5000 name:L1I_CACHE : Level 1 instruction cache access ++# event:0x15 um:zero minimum:5000 name:L1D_CACHE_WB : Level 1 data cache write-back ++event:0x16 um:zero minimum:5000 name:L2D_CACHE : Level 2 data cache access ++event:0x17 um:zero minimum:5000 name:L2D_CACHE_REFILL : Level 2 data cache refill ++event:0x18 um:zero minimum:5000 name:L2D_CACHE_WB : Level 2 data cache write-back ++event:0x19 um:zero minimum:5000 name:BUS_ACCESS : Bus access ++event:0x1A um:zero minimum:500 name:MEMORY_ERROR : Local memory error ++event:0x1B um:zero minimum:100000 name:INST_SPEC : Operation speculatively executed ++event:0x1C um:zero minimum:5000 name:TTBR_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to TTBR ++# 
event:0x1D um:zero minimum:5000 name:BUS_CYCLES : Bus cycle ++# event:0x1F um:zero minimum:5000 name:L1D_CACHE_ALLOCATE : Level 1 data cache allocation without refill ++# event:0x20 um:zero minimum:5000 name:L2D_CACHE_ALLOCATE : Level 2 data cache allocation without refill ++# X-Gene specific events ++event:0x040 um:zero minimum:10007 name:L1D_CACHE_LD : L1 data cache access - Read ++event:0x041 um:zero minimum:10007 name:L1D_CACHE_ST : L1 data cache access - Write ++event:0x042 um:zero minimum:10007 name:L1D_CACHE_REFILL_LD : L1 data cache refill - Read ++event:0x048 um:zero minimum:10007 name:L1D_CACHE_INVAL : L1 data cache invalidate ++event:0x04C um:zero minimum:10007 name:L1D_TLB_REFILL_LD : L1 data TLB refill - Read ++event:0x04D um:zero minimum:10007 name:L1D_TLB_REFILL_ST : L1 data TLB refill - Write ++event:0x050 um:zero minimum:10007 name:L2D_CACHE_LD : L2 data cache access - Read ++event:0x051 um:zero minimum:10007 name:L2D_CACHE_ST : L2 data cache access - Write ++event:0x052 um:zero minimum:10007 name:L2D_CACHE_REFILL_LD : L2 data cache refill - Read ++event:0x053 um:zero minimum:10007 name:L2D_CACHE_REFILL_ST : L2 data cache refill - Write ++event:0x056 um:zero minimum:10007 name:L2D_CACHE_WB_VICTIM : L2 data cache write-back - victim ++event:0x057 um:zero minimum:10007 name:L2D_CACHE_WB_CLEAN : L2 data cache write-back - Cleaning and coherency ++event:0x058 um:zero minimum:10007 name:L2D_CACHE_INVAL : L2 data cache invalidate ++event:0x060 um:zero minimum:10007 name:BUS_ACCESS_LD : Bus access - Read ++event:0x061 um:zero minimum:10007 name:BUS_ACCESS_ST : Bus access - Write ++event:0x062 um:zero minimum:10007 name:BUS_ACCESS_SHARED : Bus access - Normal, cacheable, sharable ++event:0x063 um:zero minimum:10007 name:BUS_ACCESS_NOT_SHARED : Bus access - Not normal, cacheable, sharable ++event:0x064 um:zero minimum:10007 name:BUS_ACCESS_NORMAL : Bus access - Normal ++event:0x065 um:zero minimum:10007 name:BUS_ACCESS_PERIPH : Bus access - Peripheral 
++event:0x066 um:zero minimum:10007 name:MEM_ACCESS_LD : Data memory access - Read ++event:0x067 um:zero minimum:10007 name:MEM_ACCESS_ST : Data memory access - Write ++event:0x068 um:zero minimum:10007 name:UNALIGNED_LD_SPEC : Unaligned access - Read ++event:0x069 um:zero minimum:10007 name:UNALIGNED_ST_SPEC : Unaligned access - Write ++event:0x06A um:zero minimum:10007 name:UNALIGNED_LDST_SPEC : Unaligned access ++event:0x06C um:zero minimum:10007 name:LDREX_SPEC : Exclusive operation speculatively executed - Load exclusive ++event:0x06D um:zero minimum:10007 name:STREX_PASS_SPEC : Exclusive operation speculatively executed - Store exclusive pass ++event:0x06E um:zero minimum:10007 name:STREX_FAIL_SPEC : Exclusive operation speculatively executed - Store exclusive fail ++event:0x06F um:zero minimum:10007 name:STREX_SPEC : Exclusive operation speculatively executed - Store exclusive ++event:0x070 um:zero minimum:10007 name:LD_SPEC : Operation speculatively executed - Load ++event:0x071 um:zero minimum:10007 name:ST_SPEC : Operation speculatively executed - Store ++event:0x072 um:zero minimum:10007 name:LDST_SPEC : Operation speculatively executed - Load or store ++event:0x073 um:zero minimum:10007 name:DP_SPEC : Operation speculatively executed - Integer data processing ++event:0x074 um:zero minimum:10007 name:ASE_SPEC : Operation speculatively executed - Advanced SIMD ++event:0x075 um:zero minimum:10007 name:VFP_SPEC : Operation speculatively executed - FP ++event:0x076 um:zero minimum:10007 name:PC_WRITE_SPEC : Operation speculatively executed - Software change of PC ++event:0x078 um:zero minimum:10007 name:BR_IMMED_SPEC : Branch speculatively executed - Immediate branch ++event:0x079 um:zero minimum:10007 name:BR_RETURN_SPEC : Branch speculatively executed - Procedure return ++event:0x07A um:zero minimum:10007 name:BR_INDIRECT_SPEC : Branch speculatively executed - Indirect branch ++event:0x07C um:zero minimum:10007 name:ISB_SPEC : Barrier speculatively executed - ISB
++event:0x07D um:zero minimum:10007 name:DSB_SPEC : Barrier speculatively executed - DSB ++event:0x07E um:zero minimum:10007 name:DMB_SPEC : Barrier speculatively executed - DMB ++event:0x081 um:zero minimum:10007 name:EXC_UNDEF : Exception taken, other synchronous ++event:0x082 um:zero minimum:10007 name:EXC_SVC : Exception taken, Supervisor Call ++event:0x083 um:zero minimum:10007 name:EXC_PABORT : Exception taken, Instruction Abort ++event:0x084 um:zero minimum:10007 name:EXC_DABORT : Exception taken, Data Abort or SError ++event:0x086 um:zero minimum:10007 name:EXC_IRQ : Exception taken, IRQ ++event:0x087 um:zero minimum:10007 name:EXC_FIQ : Exception taken, FIQ ++event:0x08A um:zero minimum:10007 name:EXC_HVC : Exception taken, Hypervisor Call ++event:0x08B um:zero minimum:10007 name:EXC_TRAP_PABORT : Exception taken, Instruction Abort not taken locally ++event:0x08C um:zero minimum:10007 name:EXC_TRAP_DABORT : Exception taken, Data Abort or SError not taken locally ++event:0x08D um:zero minimum:10007 name:EXC_TRAP_OTHER : Exception taken, other traps not taken locally ++event:0x08E um:zero minimum:10007 name:EXC_TRAP_IRQ : Exception taken, IRQ not taken locally ++event:0x08F um:zero minimum:10007 name:EXC_TRAP_FIQ : Exception taken, FIQ not taken locally ++event:0x090 um:zero minimum:10007 name:RC_LD_SPEC : Release consistency instruction speculatively executed - Load Acquire ++event:0x091 um:zero minimum:10007 name:RC_ST_SPEC : Release consistency instruction speculatively executed - Store Release ++event:0x100 um:zero minimum:10007 name:NOP_SPEC : Operation speculatively executed - NOP ++event:0x101 um:zero minimum:10007 name:FSU_CLOCK_OFF_CYCLES : FSU clocking gated off cycle ++event:0x102 um:zero minimum:10007 name:BTB_MIS_PRED : BTB misprediction ++event:0x103 um:zero minimum:10007 name:ITB_MISS : ITB miss ++event:0x104 um:zero minimum:10007 name:DTB_MISS : DTB miss ++event:0x105 um:zero minimum:10007 name:L1D_CACHE_LATE_MISS : L1 data cache late miss 
++event:0x106 um:zero minimum:10007 name:L1D_CACHE_PREFETCH : L1 data cache prefetch request ++event:0x107 um:zero minimum:10007 name:L2D_CACHE_PREFETCH : L2 data prefetch request ++event:0x108 um:zero minimum:10007 name:DECODE_STALL : Decode starved for instruction cycle ++event:0x109 um:zero minimum:10007 name:DISPATCH_STALL : Op dispatch stalled cycle ++event:0x10A um:zero minimum:10007 name:IXA_STALL : IXA Op non-issue ++event:0x10B um:zero minimum:10007 name:IXB_STALL : IXB Op non-issue ++event:0x10C um:zero minimum:10007 name:BX_STALL : BX Op non-issue ++event:0x10D um:zero minimum:10007 name:LX_STALL : LX Op non-issue ++event:0x10E um:zero minimum:10007 name:SX_STALL : SX Op non-issue ++event:0x10F um:zero minimum:10007 name:FX_STALL : FX Op non-issue ++event:0x110 um:zero minimum:10007 name:WAIT_CYCLES : Wait state cycle ++event:0x111 um:zero minimum:10007 name:L1_STAGE2_TLB_REFILL : L1 stage-2 TLB refill ++event:0x112 um:zero minimum:10007 name:PAGE_WALK_L0_STAGE1_HIT : Page Walk Cache level-0 stage-1 hit ++event:0x113 um:zero minimum:10007 name:PAGE_WALK_L1_STAGE1_HIT : Page Walk Cache level-1 stage-1 hit ++event:0x114 um:zero minimum:10007 name:PAGE_WALK_L2_STAGE1_HIT : Page Walk Cache level-2 stage-1 hit ++event:0x115 um:zero minimum:10007 name:PAGE_WALK_L1_STAGE2_HIT : Page Walk Cache level-1 stage-2 hit ++event:0x116 um:zero minimum:10007 name:PAGE_WALK_L2_STAGE2_HIT : Page Walk Cache level-2 stage-2 hit +-- +2.7.4 + diff --git a/SOURCES/oprofile-zseries.patch b/SOURCES/oprofile-zseries.patch new file mode 100644 index 0000000..bb1c3b8 --- /dev/null +++ b/SOURCES/oprofile-zseries.patch @@ -0,0 +1,142 @@ +From 966ef44b8c049b135773f1c6b71ab35b265e6356 Mon Sep 17 00:00:00 2001 +From: Andreas Arnez +Date: Mon, 23 May 2016 16:51:04 +0200 +Subject: [PATCH 18/18] s390: Add support for z13 + +On Wed, May 11 2016, Andreas Arnez wrote: + +> So far oprofile supported z Systems (s390) machines up to zEC12. This +> adds support for z13 as well. 
+> +> Signed-off-by: Andreas Arnez +> --- +> events/Makefile.am | 3 ++- +> libop/op_cpu_type.c | 3 +++ +> libop/op_cpu_type.h | 1 + +> libop/op_events.c | 1 + +> utils/ophelp.c | 1 + +> 5 files changed, 8 insertions(+), 1 deletion(-) + +Oops, that patch lacked the files in the new directory +"events/s390/z13". Corrected version below. Is this OK? + +-- >8 -- +Subject: [PATCH] s390: Add support for z13 + +So far oprofile supported z Systems (s390) machines up to zEC12. This +adds support for z13 as well. + +Signed-off-by: Andreas Arnez +--- + events/Makefile.am | 3 ++- + events/s390/z13/events | 8 ++++++++ + events/s390/z13/unit_masks | 7 +++++++ + libop/op_cpu_type.c | 3 +++ + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + utils/ophelp.c | 1 + + 7 files changed, 23 insertions(+), 1 deletion(-) + create mode 100644 events/s390/z13/events + create mode 100644 events/s390/z13/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index 677b05f..29d4b5f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -81,7 +81,8 @@ event_files = \ + tile/tilegx/events tile/tilegx/unit_masks \ + s390/z10/events s390/z10/unit_masks \ + s390/z196/events s390/z196/unit_masks \ +- s390/zEC12/events s390/zEC12/unit_masks ++ s390/zEC12/events s390/zEC12/unit_masks \ ++ s390/z13/events s390/z13/unit_masks + + install-data-local: + for i in ${event_files} ; do \ +diff --git a/events/s390/z13/events b/events/s390/z13/events +new file mode 100644 +index 0000000..313f5b0 +--- /dev/null ++++ b/events/s390/z13/events +@@ -0,0 +1,8 @@ ++# Copyright OProfile authors ++# Copyright (c) International Business Machines, 2016. ++# Contributed by Andreas Arnez . 
++# ++# IBM Enterprise z13 events for operf/ocount ++# ++event:0x00 counters:0 um:zero minimum:19264 name:CPU_CYCLES : Processor cycles ++event:0x01 counters:0 um:zero minimum:19264 name:INSTRUCTIONS : Instructions completed +diff --git a/events/s390/z13/unit_masks b/events/s390/z13/unit_masks +new file mode 100644 +index 0000000..4cf2842 +--- /dev/null ++++ b/events/s390/z13/unit_masks +@@ -0,0 +1,7 @@ ++# Copyright OProfile authors ++# Copyright (c) International Business Machines, 2016. ++# Contributed by Andreas Arnez . ++# ++# S/390 Basic Mode Hardware Sampling unit masks ++# ++include:s390/z10 +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 7bdde53..e70e4f6 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -123,6 +123,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, + { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, ++ { "IBM z13", "s390/z13", CPU_S390_Z13, 1 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -680,6 +681,8 @@ static op_cpu _get_s390_cpu_type(void) + case 2827: + case 2828: + return CPU_S390_ZEC12; ++ case 2964: ++ return CPU_S390_Z13; + } + return CPU_NO_GOOD; + } +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 98289c5..4f896a0 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -103,6 +103,7 @@ typedef enum { + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ + CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ ++ CPU_S390_Z13, /** < IBM z13 */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index cdd0409..ea6ced3 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1307,6 +1307,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case 
CPU_S390_Z10: + case CPU_S390_Z196: + case CPU_S390_ZEC12: ++ case CPU_S390_Z13: + descr->name = "CPU_CYCLES"; + descr->count = 4127518; + break; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 5821593..3cb1c08 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -779,6 +779,7 @@ int main(int argc, char const * argv[]) + case CPU_S390_Z10: + case CPU_S390_Z196: + case CPU_S390_ZEC12: ++ case CPU_S390_Z13: + event_doc = "IBM System z CPU Measurement Facility\n" + "http://www-01.ibm.com/support/docview.wss" + "?uid=isg26fcd1cc32246f4c8852574ce0044734a\n"; +-- +2.7.4 + diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec new file mode 100644 index 0000000..74fb9ce --- /dev/null +++ b/SPECS/oprofile.spec @@ -0,0 +1,222 @@ +%{?scl:%scl_package oprofile} + +Summary: System wide profiler +Name: %{?scl_prefix}oprofile +Version: 1.1.0 +Release: 4%{?dist} +License: GPLv2+ and LGPLv2+ +Group: Development/System +# +Source0: http://downloads.sourceforge.net/oprofile/oprofile-%{version}.tar.gz +#FIXME a workaround until java-1.6.0-openjdk-devel is available on all archs +Source1: openjdk-include.tar.gz +Requires: binutils +Requires: which +Requires(pre): shadow-utils +Requires(postun): shadow-utils +%{?scl:Requires:%scl_runtime} +Patch1010: oprofile-bz1335142.patch +Patch1011: oprofile-power.patch +Patch1012: oprofile-skylake.patch +Patch1013: oprofile-goldmont.patch +Patch1014: oprofile-kabylake.patch +Patch1015: oprofile-xgene.patch +Patch1016: oprofile-oparchive.patch +Patch1017: oprofile-startup.patch +Patch1018: oprofile-zseries.patch + +URL: http://oprofile.sf.net + +#If oprofile doesn't build on an arch, report it and we will add an ExcludeArch tag.
+BuildRequires: qt-devel +BuildRequires: libxslt +BuildRequires: docbook-style-xsl +BuildRequires: docbook-utils +BuildRequires: elinks +BuildRequires: gtk2-devel +BuildRequires: automake +BuildRequires: libtool +%if 0%{?rhel} >= 7 || 0%{?fedora} >= 15 +BuildRequires: binutils-static +BuildRequires: libpfm-devel >= 4.3.0 +%else +BuildRequires: %{?scl_prefix}binutils-devel +BuildRequires: binutils-devel +%endif +%if 0%{?rhel} == 6 +BuildRequires: papi-devel +%endif +%if 0%{?rhel} >= 6 +BuildRequires: popt-devel +%else +BuildRequires: popt +%endif + +#BuildRequires: java-devel +#BuildRequires: jpackage-utils +#BuildRequires: java-1.6.0-openjdk-devel + +BuildRoot: %{_tmppath}/%{name}-root + +%description +OProfile is a profiling system for systems running Linux. The +profiling runs transparently in the background, and profile data +can be collected at any time. OProfile makes use of the hardware performance +counters provided on Intel P6 and AMD Athlon family processors, and can use +the RTC for profiling on other x86 processor types. + +See the HTML documentation for further details. + +%package devel +Summary: Header files and libraries for developing apps which will use oprofile +Group: Development/Libraries +Requires: %{?scl_prefix}oprofile = %{version}-%{release} +Provides: %{?scl_prefix}oprofile-static = %{version}-%{release} + +%description devel + +Header files and libraries for developing apps which will use oprofile. + +%package jit +Summary: Libraries required for profiling Java and other JITed code +Group: Development/System +Requires: %{?scl_prefix}oprofile = %{version}-%{release} +#Requires: java >= 1.6 +#Requires: jpackage-utils +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +Requires: /etc/ld.so.conf.d + +%description jit +This package includes a base JIT support library, as well as a Java +agent library.
+ +%prep +%setup -q -n oprofile-%{version} -a1 +%patch1010 -p1 +%patch1011 -p1 +%patch1012 -p1 +%patch1013 -p1 +%patch1014 -p1 +%patch1015 -p1 +%patch1016 -p1 +%patch1017 -p1 +%patch1018 -p1 + + +./autogen.sh + +%build + +%configure \ +--with-java=`pwd`/java-1.6.0-openjdk-1.6.0.0 + +make CFLAGS="$RPM_OPT_FLAGS" + +#tweak the manual pages +find -path "*/doc/*.1" -exec \ + sed -i -e \ + 's,/doc/oprofile/,/doc/oprofile-%{version}/,g' {} \; + +%install +rm -rf %{buildroot} + +mkdir -p %{buildroot}%{_bindir} +mkdir -p %{buildroot}%{_mandir}/man1 + +make DESTDIR=%{buildroot} INSTALL="install -p" install + +# We want the manuals in the special doc dir, not the generic doc install dir. +# We build it in place and then move it away so it doesn't get installed +# twice. rpm can specify itself where the (versioned) docs go with the +# %%doc directive. +mkdir docs.installed +mv %{buildroot}%{_datadir}/doc/oprofile/* docs.installed/ + +%if 0%{?scl:1} +# if developer tools set need to put the agentlib in an appropriate place +mv %{buildroot}%{_libdir}/oprofile/* %{buildroot}%{_libdir} +rmdir %{buildroot}%{_libdir}/oprofile +%else +mkdir -p %{buildroot}%{_sysconfdir}/ld.so.conf.d +echo "%{_libdir}/oprofile" > %{buildroot}%{_sysconfdir}/ld.so.conf.d/oprofile-%{_arch}.conf +%endif + + +%pre +getent group oprofile >/dev/null || groupadd -r -g 16 oprofile +getent passwd oprofile >/dev/null || \ +useradd -g oprofile -d /var/lib/oprofile -M -r -u 16 -s /sbin/nologin \ + -c "Special user account to be used by OProfile" oprofile +exit 0 + +%postun +# do not try to remove existing oprofile user or group + +%files +%defattr(-,root,root) +%doc docs.installed/* +%doc COPYING + +%{_bindir}/* + +%{_mandir}/man1/* + +%{_datadir}/oprofile + +%files devel +%defattr(-,root,root) + +%{_includedir}/opagent.h + +%post jit -p /sbin/ldconfig + +%postun jit -p /sbin/ldconfig + +%files jit +%defattr(-,root,root) + +%if 0%{?scl:1} +%{_libdir}/* +%else +%{_libdir}/oprofile +%{_sysconfdir}/ld.so.conf.d/* 
+%endif + +%changelog +* Wed Oct 12 2016 Will Cohen - 1.1.0-4 +- Update events for non-x86 architectures (aarch64, power, zseries). +- Add support for newer Intel processors. + +* Thu Sep 15 2016 Will Cohen - 1.1.0-3 +- Avoid duplicate event names for Nehalem and Westmere processors. + +* Thu Aug 13 2015 Will Cohen - 1.1.0-2 +- Locate jvm agent libjvmti in a LD_LIBRARY_PATH directory. + +* Tue Jul 21 2015 Will Cohen - 1.1.0-1 +- Rebase to oprofile-1.1.0. + +* Thu Apr 23 2015 Will Cohen - 0.9.9-7 +- LLC_REFS and LLC_MISSES do not work on some CPUs. +- incorrect handling of default unit masks longer than 11 characters +- Oprofile updates for Avoton +- Unable to profile jited JVM code when using static huge pages +- operf causes rpmbuild to fail + +* Wed Sep 17 2014 Will Cohen - 0.9.9-6 +- Update support for Intel Silvermont (Avoton). +- Enable configure for ppc64le. + +* Mon Aug 18 2014 Will Cohen - 0.9.9-5 +- Update Intel Haswell events. +- Add support for Intel Silvermont (Avoton). +- Add support for Intel Broadwell. +- Add support for aarch64. +- Update IBM power8 events. + +* Wed May 28 2014 Will Cohen - 0.9.9-2.1 +- Correct xml output. + +* Fri May 16 2014 Will Cohen - 0.9.9-1.1 +- Rebase on oprofile-0.9.9.