From 22587ba192138d52db37c11a53cc1f273e5b9f98 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Mar 05 2015 13:16:06 +0000 Subject: import oprofile-0.9.9-7.el7 --- diff --git a/SOURCES/oprofile-aarch64.patch b/SOURCES/oprofile-aarch64.patch new file mode 100644 index 0000000..fec25f6 --- /dev/null +++ b/SOURCES/oprofile-aarch64.patch @@ -0,0 +1,677 @@ +commit 34d0065a1a790fc2be05a5ef1d8b0bbf28b814fe +Author: William Cohen +Date: Wed Feb 12 08:05:38 2014 -0600 + + Provide basic AArch64 (ARMv8) support + + The AArch64 (ARMv8) support is provided as an ARM variant to allow use + in both 32-bit and 64-bit ARM environments. The support in this patch + is just the basic events described in the AArch64 documentation. + AArch64 processor implementation may provide additional implementation + specific events. One could add code to recognize those processor + specific implementations and include the armv8-pmuv3-common base + events into the event sets for the processor implementations. + The APM X-Gene processor type is included in this patch as an + implementation, although there are no known processor-specific events + to add at this time. + + Below is example run on the ARM Foundation simulator collecting data + on a build of OProfile. + + $ cd oprofile + $ operf make + ... + $ opreport -t 5 + Using /home/wcohen/oprofile/oprofile/oprofile_data/samples/ for samples directory. + + WARNING: Lost samples detected! See /home/wcohen/oprofile/oprofile/oprofile_data/samples/operf.log for details. 
+ CPU: ARM AArch64 + Counted CPU_CYCLES events (Cycle) with a unit mask of 0x00 (No unit mask) count 100000 + CPU_CYCLES:100000| + samples| %| + ------------------ + 10943 90.5877 make + CPU_CYCLES:100000| + samples| %| + ------------------ + 5281 48.2592 make + 4543 41.5151 libc-2.17.so + 1079 9.8602 kallsyms + 40 0.3655 ld-2.17.so + 735 6.0844 sh + CPU_CYCLES:100000| + samples| %| + ------------------ + 321 43.6735 kallsyms + 298 40.5442 libc-2.17.so + 94 12.7891 bash + 22 2.9932 ld-2.17.so + + Signed-off-by: William Cohen + +diff --git a/events/Makefile.am b/events/Makefile.am +index ad45642..3e43d10 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -59,6 +59,8 @@ event_files = \ + arm/armv7-ca7/events arm/armv7-ca7/unit_masks \ + arm/armv7-ca15/events arm/armv7-ca15/unit_masks \ + arm/mpcore/events arm/mpcore/unit_masks \ ++ arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ ++ arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-pmuv3-common/events b/events/arm/armv8-pmuv3-common/events +new file mode 100644 +index 0000000..3cdff03 +--- /dev/null ++++ b/events/arm/armv8-pmuv3-common/events +@@ -0,0 +1,38 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARMv8 pmu v3 architected events ++ ++event:0x00 um:zero minimum:500 name:SW_INCR : Instruction architecturally executed, condition code check pass, software increment ++event:0x01 um:zero minimum:5000 name:L1I_CACHE_REFILL : Level 1 instruction cache refill ++event:0x02 um:zero minimum:5000 name:L1I_TLB_REFILL : Level 1 instruction TLB refill ++event:0x03 um:zero minimum:5000 name:L1D_CACHE_REFILL : Level 1 data cache refill ++event:0x04 um:zero minimum:5000 name:L1D_CACHE : Level 1 data cache access ++event:0x05 um:zero minimum:5000 name:L1D_TLB_REFILL : Level 1 data TLB refill ++event:0x06 um:zero minimum:100000 name:LD_RETIRED : Instruction architecturally executed, condition code check pass, load ++event:0x07 um:zero minimum:100000 name:ST_RETIRED : Instruction architecturally executed, condition code check pass, store ++event:0x08 um:zero minimum:100000 name:INST_RETIRED : Instruction architecturally executed ++event:0x09 um:zero minimum:500 name:EXC_TAKEN : Exception taken ++event:0x0A um:zero minimum:500 name:EXC_RETURN : Instruction architecturally executed, condition code check pass, exception return ++event:0x0B um:zero minimum:500 name:CID_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to CONTEXTIDR ++event:0x0C um:zero minimum:5000 name:PC_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, software change of the PC ++event:0x0D um:zero minimum:5000 name:BR_IMMED_RETIRED : Instruction architecturally executed, immediate branch ++event:0x0E um:zero minimum:5000 name:BR_RETURN_RETIRED : Instruction architecturally executed, condition code check pass, procedure return ++event:0x0F um:zero minimum:500 name:UNALIGNED_LDST_RETIRED : Instruction architecturally executed, condition code check pass, unaligned load or store ++event:0x10 um:zero minimum:5000 name:BR_MIS_PRED : Mispredicted or not predicted branch speculatively executed ++event:0x11 um:zero 
minimum:100000 name:CPU_CYCLES : Cycle ++event:0x12 um:zero minimum:5000 name:BR_PRED : Predictable branch speculatively executed ++event:0x13 um:zero minimum:100000 name:MEM_ACCESS : Data memory access ++event:0x14 um:zero minimum:5000 name:L1I_CACHE : Level 1 instruction cache access ++event:0x15 um:zero minimum:5000 name:L1D_CACHE_WB : Level 1 data cache write-back ++event:0x16 um:zero minimum:5000 name:L2D_CACHE : Level 2 data cache access ++event:0x17 um:zero minimum:5000 name:L2D_CACHE_REFILL : Level 2 data cache refill ++event:0x18 um:zero minimum:5000 name:L2D_CACHE_WB : Level 2 data cache write-back ++event:0x19 um:zero minimum:5000 name:BUS_ACCESS : Bus access ++event:0x1A um:zero minimum:500 name:MEMORY_ERROR : Local memory error ++event:0x1B um:zero minimum:100000 name:INST_SPEC : Operation speculatively executed ++event:0x1C um:zero minimum:5000 name:TTBR_WRITE_RETIRED : Instruction architecturally executed, condition code check pass, write to TTBR ++event:0x1D um:zero minimum:5000 name:BUS_CYCLES : Bus cycle ++event:0x1F um:zero minimum:5000 name:L1D_CACHE_ALLOCATE : Level 1 data cache allocation without refill ++event:0x20 um:zero minimum:5000 name:L2D_CACHE_ALLOCATE : Level 2 data cache allocation without refill +diff --git a/events/arm/armv8-pmuv3-common/unit_masks b/events/arm/armv8-pmuv3-common/unit_masks +new file mode 100644 +index 0000000..7666c35 +--- /dev/null ++++ b/events/arm/armv8-pmuv3-common/unit_masks +@@ -0,0 +1,4 @@ ++# ARMv8 architected events unit masks ++# ++name:zero type:mandatory default:0x00 ++ 0x00 No unit mask +diff --git a/events/arm/armv8-xgene/events b/events/arm/armv8-xgene/events +new file mode 100644 +index 0000000..3e28463 +--- /dev/null ++++ b/events/arm/armv8-xgene/events +@@ -0,0 +1,7 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# Basic ARM V8 events ++# ++include:arm/armv8-pmuv3-common +diff --git a/events/arm/armv8-xgene/unit_masks b/events/arm/armv8-xgene/unit_masks +new file mode 100644 +index 0000000..9ace2eb +--- /dev/null ++++ b/events/arm/armv8-xgene/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 architected events unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 1ae2913..0cfb4ea 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -129,6 +129,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, ++ { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -395,6 +396,11 @@ static op_cpu _get_arm_cpu_type(void) + case 0xc0f: + return op_get_cpu_number("arm/armv7-ca15"); + } ++ } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ ++ switch (cpuid) { ++ case 0x000: ++ return op_get_cpu_number("arm/armv8-xgene"); ++ } + } else if (vendorid == 0x69) { /* Intel xscale */ + switch (cpuid >> 9) { + case 1: +@@ -631,7 +637,8 @@ static op_cpu __get_cpu_type_alt_method(void) + if (strncmp(uname_info.machine, "ppc64", 5) == 0) { + return _get_ppc64_cpu_type(); + } +- if (strncmp(uname_info.machine, "arm", 3) == 0) { ++ if (strncmp(uname_info.machine, "arm", 3) == 0 || ++ strncmp(uname_info.machine, "aarch64", 7) == 0) { + return _get_arm_cpu_type(); + } + if (strncmp(uname_info.machine, "tile", 4) == 0) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 67e16de..7c478ad 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -109,6 +109,7 @@ typedef enum { + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ + CPU_SILVERMONT, /** 
< Intel Silvermont microarchitecture */ + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ ++ CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 358a154..e0d3ed5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,6 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: ++ case CPU_ARM_V8_XGENE: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/opcontrol b/utils/opcontrol +index 38bb1ac..04a4a91 100755 +--- a/utils/opcontrol ++++ b/utils/opcontrol +@@ -400,6 +400,11 @@ do_init() + do_deinit + exit 1 + ;; ++ aarch64/*) ++ echo "*** ARM AArch64 processors are not supported with opcontrol. Please use operf instead. ***" ++ do_deinit ++ exit 1 ++ ;; + esac + fi + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index af4c1e5..35f47bc 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -656,6 +656,13 @@ int main(int argc, char const * argv[]) + "Cortex A15 DDI (ARM DDI 0438F, revision r3p1)\n"; + break; + ++ case CPU_ARM_V8_APM_XGENE: ++ event_doc = ++ "See ARM Architecture Reference Manual \n" ++ "ARMv8, for ARMv8-A architecture profile\n" ++ "DDI (ARM DDI0487A.a)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" + + +commit a5eec42a9324915947e78634ddcce55b159a5dd2 +Author: Maynard Johnson +Date: Wed Feb 12 08:29:15 2014 -0600 + + Minor fixup for previous commit + + The previous commit for the new APM X-Gene (AArch64 ARMv8) + processor went through a number of iterations before acceptance. + I missed changing one of the references to the new CPU type + from CPU_ARM_V8_XGENE to CPU_ARM_V8_APM_XGENE when I committed it. + This patch fixes that. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_events.c b/libop/op_events.c +index e0d3ed5..77fc8a5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,7 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: +- case CPU_ARM_V8_XGENE: ++ case CPU_ARM_V8_APM_XGENE + descr->name = "CPU_CYCLES"; + break; + +commit c4e390042458aee07016da0cab251b0ad67b8d2b +Author: William Cohen +Date: Wed Feb 12 11:56:39 2014 -0500 + + Add missing ':' on case statement for CPU_ARM_V8_APM_XGENE + +diff --git a/libop/op_events.c b/libop/op_events.c +index 77fc8a5..968ff04 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1253,7 +1253,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_AVR32: + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: +- case CPU_ARM_V8_APM_XGENE ++ case CPU_ARM_V8_APM_XGENE: + descr->name = "CPU_CYCLES"; + break; + +From 40adac210cf9ac8d79a90609c91b8ee5e05b8a2f Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Mon, 21 Jul 2014 14:36:23 -0400 +Subject: [PATCH 1/2] Add oprofile support for ARM Cortex A57 microarchitecture + +This patch adds the event list of the ARM Cortex A57 architecture. + +The patch is very straight forward: just add the model numbers and +type in the usual places and add the event list. 
+ +Passes make check + +Signed-off-by: William Cohen +--- + events/Makefile.am | 1 + + events/arm/armv8-ca57/events | 67 ++++++++++++++++++++++++++++++++++++++++ + events/arm/armv8-ca57/unit_masks | 3 ++ + libop/op_cpu_type.c | 3 ++ + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + utils/ophelp.c | 6 ++++ + 7 files changed, 82 insertions(+) + create mode 100644 events/arm/armv8-ca57/events + create mode 100644 events/arm/armv8-ca57/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index f6fd3d7..b4bca1e 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -62,6 +62,7 @@ event_files = \ + arm/mpcore/events arm/mpcore/unit_masks \ + arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ + arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ ++ arm/armv8-ca57/events arm/armv8-ca57/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-ca57/events b/events/arm/armv8-ca57/events +new file mode 100644 +index 0000000..62974c1 +--- /dev/null ++++ b/events/arm/armv8-ca57/events +@@ -0,0 +1,67 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARM Cortex A57 events ++# From Cortex A57 TRM ++# ++include:arm/armv8-pmuv3-common ++event:0x40 um:zero minimum:10007 name:L1D_CACHE_LD : Level 1 data cache access - Read ++event:0x41 um:zero minimum:10007 name:L1D_CACHE_ST : Level 1 data cache access - Write ++event:0x42 um:zero minimum:10007 name:L1D_CACHE_REFILL_LD : Level 1 data cache refill - Read ++event:0x43 um:zero minimum:10007 name:L1D_CACHE_REFILL_ST : Level 1 data cache refill - Write ++event:0x46 um:zero minimum:10007 name:L1D_CACHE_WB_VICTIM : Level 1 data cache Write-back - Victim ++event:0x47 um:zero minimum:10007 name:L1D_CACHE_WB_CLEAN : Level 1 data cache Write-back - Cleaning event:and coherency ++event:0x48 um:zero minimum:10007 name:L1D_CACHE_INVAL : Level 1 data cache invalidate ++event:0x4C um:zero minimum:10007 name:L1D_TLB_REFILL_LD : Level 1 data TLB refill - Read ++event:0x4D um:zero minimum:10007 name:L1D_TLB_REFILL_ST : Level 1 data TLB refill - Write ++event:0x50 um:zero minimum:10007 name:L2D_CACHE_LD : Level 2 data cache access - Read ++event:0x51 um:zero minimum:10007 name:L2D_CACHE_ST : Level 2 data cache access - Write ++event:0x52 um:zero minimum:10007 name:L2D_CACHE_REFILL_LD : Level 2 data cache refill - Read ++event:0x53 um:zero minimum:10007 name:L2D_CACHE_REFILL_ST : Level 2 data cache refill - Write ++event:0x56 um:zero minimum:10007 name:L2D_CACHE_WB_VICTIM : Level 2 data cache Write-back - Victim ++event:0x57 um:zero minimum:10007 name:L2D_CACHE_WB_CLEAN : Level 2 data cache Write-back - Cleaning and coherency ++event:0x58 um:zero minimum:10007 name:L2D_CACHE_INVAL : Level 2 data cache invalidate ++event:0x60 um:zero minimum:10007 name:BUS_ACCESS_LD : Bus access - Read ++event:0x61 um:zero minimum:10007 name:BUS_ACCESS_ST : Bus access - Write ++event:0x62 um:zero minimum:10007 name:BUS_ACCESS_SHARED : Bus access - Normal ++event:0x63 um:zero minimum:10007 name:BUS_ACCESS_NOT_SHARED : Bus access - Not normal ++event:0x64 um:zero 
minimum:10007 name:BUS_ACCESS_NORMAL : Bus access - Normal ++event:0x65 um:zero minimum:10007 name:BUS_ACCESS_PERIPH : Bus access - Peripheral ++event:0x66 um:zero minimum:10007 name:MEM_ACCESS_LD : Data memory access - Read ++event:0x67 um:zero minimum:10007 name:MEM_ACCESS_ST : Data memory access - Write ++event:0x68 um:zero minimum:10007 name:UNALIGNED_LD_SPEC : Unaligned access - Read ++event:0x69 um:zero minimum:10007 name:UNALIGNED_ST_SPEC : Unaligned access - Write ++event:0x6A um:zero minimum:10007 name:UNALIGNED_LDST_SPEC : Unaligned access ++event:0x6C um:zero minimum:10007 name:LDREX_SPEC : Exclusive operation speculatively executed - LDREX ++event:0x6D um:zero minimum:10007 name:STREX_PASS_SPEC : Exclusive instruction speculatively executed - STREX pass ++event:0x6E um:zero minimum:10007 name:STREX_FAIL_SPEC : Exclusive operation speculatively executed - STREX fail ++event:0x70 um:zero minimum:10007 name:LD_SPEC : Operation speculatively executed - Load ++event:0x71 um:zero minimum:10007 name:ST_SPEC : Operation speculatively executed - Store ++event:0x72 um:zero minimum:10007 name:LDST_SPEC : Operation speculatively executed - Load or store ++event:0x73 um:zero minimum:10007 name:DP_SPEC : Operation speculatively executed - Integer data processing ++event:0x74 um:zero minimum:10007 name:ASE_SPEC : Operation speculatively executed - Advanced SIMD ++event:0x75 um:zero minimum:10007 name:VFP_SPEC : Operation speculatively executed - VFP ++event:0x76 um:zero minimum:10007 name:PC_WRITE_SPEC : Operation speculatively executed - Software change of the PC ++event:0x77 um:zero minimum:10007 name:CRYPTO_SPEC : Operation speculatively executed, crypto data processing ++event:0x78 um:zero minimum:10007 name:BR_IMMED_SPEC : Branch speculatively executed - Immediate branch ++event:0x79 um:zero minimum:10007 name:BR_RETURN_SPEC : Branch speculatively executed - Procedure return ++event:0x7A um:zero minimum:10007 name:BR_INDIRECT_SPEC : Branch speculatively executed 
- Indirect branch ++event:0x7C um:zero minimum:10007 name:ISB_SPEC : Barrier speculatively executed - ISB ++event:0x7D um:zero minimum:10007 name:DSB_SPEC : Barrier speculatively executed - DSB ++event:0x7E um:zero minimum:10007 name:DMB_SPEC : Barrier speculatively executed - DMB ++event:0x81 um:zero minimum:10007 name:EXC_UNDEF : Exception taken, other synchronous ++event:0x82 um:zero minimum:10007 name:EXC_SVC : Exception taken, Supervisor Call ++event:0x83 um:zero minimum:10007 name:EXC_PABORT : Exception taken, Instruction Abort ++event:0x84 um:zero minimum:10007 name:EXC_DABORT : Exception taken, Data Abort or SError ++event:0x86 um:zero minimum:10007 name:EXC_IRQ : Exception taken, IRQ ++event:0x87 um:zero minimum:10007 name:EXC_FIQ : Exception taken, FIQ ++event:0x88 um:zero minimum:10007 name:EXC_SMC : Exception taken, Secure Monitor Call ++event:0x8A um:zero minimum:10007 name:EXC_HVC : Exception taken, Hypervisor Call ++event:0x8B um:zero minimum:10007 name:EXC_TRAP_PABORT : Exception taken, Instruction Abort not taken locally ++event:0x8C um:zero minimum:10007 name:EXC_TRAP_DABORT : Exception taken, Data Abort, or SError not taken locally ++event:0x8D um:zero minimum:10007 name:EXC_TRAP_OTHER : Exception taken – Other traps not taken locally ++event:0x8E um:zero minimum:10007 name:EXC_TRAP_IRQ : Exception taken, IRQ not taken locally ++event:0x8F um:zero minimum:10007 name:EXC_TRAP_FIQ : Exception taken, FIQ not taken locally ++event:0x90 um:zero minimum:10007 name:RC_LD_SPEC : Release consistency instruction speculatively executed – Load-Acquire ++event:0x91 um:zero minimum:10007 name:RC_ST_SPEC : Release consistency instruction speculatively executed – Store-Release +diff --git a/events/arm/armv8-ca57/unit_masks b/events/arm/armv8-ca57/unit_masks +new file mode 100644 +index 0000000..5d69263 +--- /dev/null ++++ b/events/arm/armv8-ca57/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 Cortex A57 unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git 
a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index bce230a..163bd1c 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -131,6 +131,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, ++ { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -396,6 +397,8 @@ static op_cpu _get_arm_cpu_type(void) + return op_get_cpu_number("arm/armv7-ca9"); + case 0xc0f: + return op_get_cpu_number("arm/armv7-ca15"); ++ case 0xd07: ++ return op_get_cpu_number("arm/armv8-ca57"); + } + } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ + switch (cpuid) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 3754156..aebd7f6 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -111,6 +111,7 @@ typedef enum { + CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ ++ CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index b8900a5..d5249b7 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1255,6 +1255,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_ARM_SCORPION: + case CPU_ARM_SCORPIONMP: + case CPU_ARM_V8_APM_XGENE: ++ case CPU_ARM_V8_CA57: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index bf3fbcb..a5edf56 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -664,6 +664,12 @@ int main(int argc, char const * argv[]) + "DDI (ARM DDI0487A.a)\n"; + break; + ++ case CPU_ARM_V8_CA57: ++ event_doc = ++ "See Cortex-A57 MPCore 
Technical Reference Manual\n" ++ "Cortex A57 DDI (ARM DDI 0488D, revision r1p1)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" +-- +1.9.3 + +From 78db0d3eb65e6005931b0402484e759c35df79f1 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Wed, 23 Jul 2014 23:25:21 -0400 +Subject: [PATCH] Add oprofile support for ARM Cortex A53 microarchitecture + +This patch adds the event list of the ARM Cortex A53 architecture. + +The patch is very straight forward: just add the model numbers and +type in the usual places and add the event list. + +Passes make check + +Signed-off-by: William Cohen +--- + events/Makefile.am | 1 + + events/arm/armv8-ca53/events | 38 ++++++++++++++++++++++++++++++++++++++ + events/arm/armv8-ca53/unit_masks | 3 +++ + libop/op_cpu_type.c | 3 +++ + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + utils/ophelp.c | 6 ++++++ + 7 files changed, 53 insertions(+) + create mode 100644 events/arm/armv8-ca53/events + create mode 100644 events/arm/armv8-ca53/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index b4bca1e..67be125 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -63,6 +63,7 @@ event_files = \ + arm/armv8-pmuv3-common/events arm/armv8-pmuv3-common/unit_masks \ + arm/armv8-xgene/events arm/armv8-xgene/unit_masks \ + arm/armv8-ca57/events arm/armv8-ca57/unit_masks \ ++ arm/armv8-ca53/events arm/armv8-ca53/unit_masks \ + avr32/events avr32/unit_masks \ + mips/20K/events mips/20K/unit_masks \ + mips/24K/events mips/24K/unit_masks \ +diff --git a/events/arm/armv8-ca53/events b/events/arm/armv8-ca53/events +new file mode 100644 +index 0000000..5e1b4d8 +--- /dev/null ++++ b/events/arm/armv8-ca53/events +@@ -0,0 +1,38 @@ ++# ++# Copyright (c) Red Hat, 2014. 
++# Contributed by William Cohen ++# ++# ARM Cortex A53 events ++# From Cortex A53 TRM ++# ++include:arm/armv8-pmuv3-common ++event:0x60 um:zero minimum:10007 name:BUS_ACCESS_LD : Bus access - Read ++event:0x61 um:zero minimum:10007 name:BUS_ACCESS_ST : Bus access - Write ++event:0x7A um:zero minimum:10007 name:BR_INDIRECT_SPEC : Branch speculatively executed - Indirect branch ++event:0x86 um:zero minimum:10007 name:EXC_IRQ : Exception taken, IRQ ++event:0x87 um:zero minimum:10007 name:EXC_FIQ : Exception taken, FIQ ++event:0xC0 um:zero minimum:10007 name:EXT_MEM_REQ : External memory request ++event:0xC1 um:zero minimum:10007 name:EXT_MEM_REQ_NC : Non-cacheable external memory request ++event:0xC2 um:zero minimum:10007 name:PREFETCH_LINEFILL : Linefill because of prefetch ++event:0xC3 um:zero minimum:10007 name:PREFETCH_LINEFILL_DROP : Instruction Cache Throttle occurred ++event:0xC4 um:zero minimum:10007 name:READ_ALLOC_ENTER : Entering read allocate mode ++event:0xC5 um:zero minimum:10007 name:READ_ALLOC : Read allocate mode ++event:0xC6 um:zero minimum:10007 name:PRE_DECODE_ERR : Pre-decode error ++event:0xC7 um:zero minimum:10007 name:STALL_SB_FULL : Data Write operation that stalls the pipeline because the store buffer is full ++event:0xC8 um:zero minimum:10007 name:EXT_SNOOP : SCU Snooped data from another CPU for this CPU ++event:0xC9 um:zero minimum:10007 name:BR_COND : Conditional branch executed ++event:0xCA um:zero minimum:10007 name:BR_INDIRECT_MISPRED : Indirect branch mispredicted ++event:0xCB um:zero minimum:10007 name:BR_INDIRECT_MISPRED_ADDR : Indirect branch mispredicted because of address miscompare ++event:0xCC um:zero minimum:10007 name:BR_COND_MISPRED : Conditional branch mispredicted ++event:0xD0 um:zero minimum:10007 name:L1I_CACHE_ERR : L1 Instruction Cache (data or tag) memory error ++event:0xD1 um:zero minimum:10007 name:L1D_CACHE_ERR : L1 Data Cache (data, tag or dirty) memory error, correctable or non-correctable ++event:0xD2 um:zero 
minimum:10007 name:TLB_ERR : TLB memory error ++event:0xE0 um:zero minimum:10007 name:OTHER_IQ_DEP_STALL : Cycles that the DPU IQ is empty and that is not because of a recent micro-TLB miss, instruction cache miss or pre-decode error ++event:0xE1 um:zero minimum:10007 name:IC_DEP_STALL : Cycles the DPU IQ is empty and there is an instruction cache miss being processed ++event:0xE2 um:zero minimum:10007 name:IUTLB_DEP_STALL : Cycles the DPU IQ is empty and there is an instruction micro-TLB miss being processed ++event:0xE3 um:zero minimum:10007 name:DECODE_DEP_STALL : Cycles the DPU IQ is empty and there is a pre-decode error being processed ++event:0xE4 um:zero minimum:10007 name:OTHER_INTERLOCK_STALL : Cycles there is an interlock other than Advanced SIMD/Floating-point instructions or load/store instruction ++event:0xE5 um:zero minimum:10007 name:AGU_DEP_STALL : Cycles there is an interlock for a load/store instruction waiting for data to calculate the address in the AGU ++event:0xE6 um:zero minimum:10007 name:SIMD_DEP_STALL : Cycles there is an interlock for an Advanced SIMD/Floating-point operation. 
++event:0xE7 um:zero minimum:10007 name:LD_DEP_STALL : Cycles there is a stall in the Wr stage because of a load miss ++event:0xE8 um:zero minimum:10007 name:ST_DEP_STALL : Cycles there is a stall in the Wr stage because of a store +diff --git a/events/arm/armv8-ca53/unit_masks b/events/arm/armv8-ca53/unit_masks +new file mode 100644 +index 0000000..42b12b4 +--- /dev/null ++++ b/events/arm/armv8-ca53/unit_masks +@@ -0,0 +1,3 @@ ++# ARMv8 Cortex A53 unit masks ++# ++include:arm/armv8-pmuv3-common +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 163bd1c..055c64b 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -132,6 +132,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 }, + { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, ++ { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -399,6 +400,8 @@ static op_cpu _get_arm_cpu_type(void) + return op_get_cpu_number("arm/armv7-ca15"); + case 0xd07: + return op_get_cpu_number("arm/armv8-ca57"); ++ case 0xd03: ++ return op_get_cpu_number("arm/armv8-ca53"); + } + } else if (vendorid == 0x50) { /* Applied Micro Circuits Corporation */ + switch (cpuid) { +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index aebd7f6..a6bb323 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -112,6 +112,7 @@ typedef enum { + CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + CPU_ARM_V8_APM_XGENE, /* APM X-Gene */ + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ ++ CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index d5249b7..bbeb212 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1256,6 +1256,7 @@ void op_default_event(op_cpu cpu_type, 
struct op_default_event_descr * descr) + case CPU_ARM_SCORPIONMP: + case CPU_ARM_V8_APM_XGENE: + case CPU_ARM_V8_CA57: ++ case CPU_ARM_V8_CA53: + descr->name = "CPU_CYCLES"; + break; + +diff --git a/utils/ophelp.c b/utils/ophelp.c +index a5edf56..980c6dc 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -670,6 +670,12 @@ int main(int argc, char const * argv[]) + "Cortex A57 DDI (ARM DDI 0488D, revision r1p1)\n"; + break; + ++ case CPU_ARM_V8_CA53: ++ event_doc = ++ "See Cortex-A53 MPCore Technical Reference Manual\n" ++ "Cortex A57 DDI (ARM DDI 0500D, revision r0p2)\n"; ++ break; ++ + case CPU_PPC64_PA6T: + event_doc = + "See PA6T Power Implementation Features Book IV\n" +-- +1.9.3 + +From 76464b279cf20bb0bb40e758afb32eaf4195d861 Mon Sep 17 00:00:00 2001 +From: Maynard Johnson +Date: Fri, 1 Aug 2014 09:06:17 -0500 +Subject: [PATCH 1/2] Add another ARM internal mapping symbol to ignore + +Ignore "$x" symbols, which can show up as internal +mapping symbols in binaries built on Aarch64. 
+ +Reported-byP: Andrew Haley +Signed-off-by: Maynard Johnson +--- + libutil++/bfd_support.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index a3bee99..0554616 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -475,7 +475,8 @@ bool interesting_symbol(asymbol * sym) + /* ARM assembler internal mapping symbols aren't interesting */ + if ((strcmp("$a", sym->name) == 0) || + (strcmp("$t", sym->name) == 0) || +- (strcmp("$d", sym->name) == 0)) ++ (strcmp("$d", sym->name) == 0))|| ++ (strcmp("$x", sym->name) == 0)) + return false; + + // C++ exception stuff +-- +1.9.3 + +From a4bdbc9ce94b15df3d19d60a11e4c4f2fc729cd9 Mon Sep 17 00:00:00 2001 +From: Maynard Johnson +Date: Fri, 1 Aug 2014 09:25:55 -0500 +Subject: [PATCH 2/2] Fix mis-placed parentheses in previous commit that caused + build error + +Signed-off-by: Maynard Johnson +--- + libutil++/bfd_support.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index 0554616..d5fd70d 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -475,7 +475,7 @@ bool interesting_symbol(asymbol * sym) + /* ARM assembler internal mapping symbols aren't interesting */ + if ((strcmp("$a", sym->name) == 0) || + (strcmp("$t", sym->name) == 0) || +- (strcmp("$d", sym->name) == 0))|| ++ (strcmp("$d", sym->name) == 0) || + (strcmp("$x", sym->name) == 0)) + return false; + +-- +1.9.3 + diff --git a/SOURCES/oprofile-broadwell.patch b/SOURCES/oprofile-broadwell.patch new file mode 100644 index 0000000..c328b12 --- /dev/null +++ b/SOURCES/oprofile-broadwell.patch @@ -0,0 +1,853 @@ +commit 6d692179cb44e68a3cfaeac213e3244f858676b8 +Author: Andi Kleen +Date: Wed Jul 16 08:03:54 2014 -0500 + + Add oprofile support for Broadwell microarchitecture + + This patch adds the event list of the Intel Broadwell architecture. 
+ Hopefully this can still make 1.0 + + The patch is very straight forward: just add the model numbers and + type in the usual places and add the event list. + + Passes make check + + Some notes: + - Haswell included one Broadwell model number by mistake. I moved + that to Broadwell now. + - oprofile doesn't support umask sub events with different counter + constraints than other events. This affects a few events on Broadwell. + However it's not a problem when oprofile uses perf as a backend, + as perf will know how to schedule these events (once it gets the + Broadwell support). It won't work correctly with the old driver. + Most of these events are not too useful for sampling, so in practice + it's not a real problem. + - As usual PEBS events and events with offcore mask and uncore + events are missing. + + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index 3e43d10..f6fd3d7 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -21,6 +21,7 @@ event_files = \ + i386/sandybridge/events i386/sandybridge/unit_masks \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ ++ i386/broadwell/events i386/broadwell/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ +diff --git a/events/i386/broadwell/events b/events/i386/broadwell/events +new file mode 100644 +index 0000000..6a4b388 +--- /dev/null ++++ b/events/i386/broadwell/events +@@ -0,0 +1,65 @@ ++# ++# Intel "Broadwell" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Broadwell based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++include:i386/arch_perfmon ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : ++event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x14 counters:cpuid um:one minimum:2000003 name:arith_fpu_div_active : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : ++event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:x02 minimum:100003 name:load_hit_pre_hw_pf : ++event:0x4f counters:cpuid um:x10 minimum:2000003 name:ept_walk_cycles : ++event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : ++event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : ++event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:x02 minimum:200003 name:icache_misses : ++event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:one minimum:2000003 name:ild_stall_lcp 
: ++event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : ++event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : ++event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : ++event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000003 name:uops_dispatched_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa8 counters:cpuid um:lsd minimum:2000003 name:lsd : ++event:0xab counters:cpuid um:x02 minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : ++event:0xc0 counters:1 um:inst_retired minimum:2000003 name:inst_retired : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:2000003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:0,1,2,3 um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : ++event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : ++event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired 
: ++event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : ++event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : ++event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : ++event:0xf2 counters:cpuid um:x05 minimum:100003 name:l2_lines_out_demand_clean : +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +new file mode 100644 +index 0000000..470e9e9 +--- /dev/null ++++ b/events/i386/broadwell/unit_masks +@@ -0,0 +1,316 @@ ++# ++# Unit masks for the Intel "Broadwell" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Broadwell based CPUs ++# ++include:i386/arch_perfmon ++name:x02 type:mandatory default:0x2 ++ 0x2 No unit mask ++name:x03 type:mandatory default:0x3 ++ 0x3 No unit mask ++name:x05 type:mandatory default:0x5 ++ 0x5 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask ++name:x1f type:mandatory default:0x1f ++ 0x1f No unit mask ++name:x20 type:mandatory default:0x20 ++ 0x20 No unit mask ++name:x50 type:mandatory default:0x50 ++ 0x50 No unit mask ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward This event counts how many times the load operation got the true Block-on-Store blocking code preventing store forwarding. This includes cases when: - preceding store conflicts with the load (incomplete overlap); - store forwarding is impossible due to u-arch limitations; - preceding lock RMW operations are not forwarded; - store has the no-forward bit set (uncacheable/page-split/masked stores); - all-blocking stores are used (mostly, fences and port I/O); and others. 
The most common case is a load blocked due to its address range overlapping with a preceding smaller uncompleted store. Note: This event does not take into account cases of out-of-SW-control (for example, SbTailHit), unknown physical STA, and cases of blocking loads on store due to being non-WB memory type or a lock. These cases are covered by other events. See the table of not supported store forwards in the Optimization Guide. ++ 0x8 extra: no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. ++name:misalign_mem_ref type:exclusive default:0x1 ++ 0x1 extra: loads This event counts speculative cache-line split load uops dispatched to the L1 cache. ++ 0x2 extra: stores This event counts speculative cache line split store-address (STA) uops dispatched to the L1 cache. ++name:dtlb_load_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts load misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts load misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. ++ 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. ++ 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks ++name:uops_issued type:exclusive default:0x1 ++ 0x1 extra: any This event counts the number of Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS). ++ 0x10 extra: flags_merge Number of flags-merge uops being allocated. 
Such uops considered perf sensitive; added by GSR u-arch. ++ 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. ++ 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated ++ 0x1 extra:inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. ++name:l2_rqsts type:exclusive default:0x21 ++ 0x21 extra: demand_data_rd_miss This event counts the number of demand Data Read requests that miss L2 cache. Only not rejected loads are counted. ++ 0x41 extra: demand_data_rd_hit This event counts the number of demand Data Read requests that hit L2 cache. Only not rejected loads are counted. ++ 0x30 extra: l2_pf_miss This event counts the number of requests from the L2 hardware prefetchers that miss L2 cache. ++ 0x50 extra: l2_pf_hit This event counts the number of requests from the L2 hardware prefetchers that hit L2 cache. L3 prefetch new types ++ 0xe1 extra: all_demand_data_rd This event counts the number of demand Data Read requests (including requests from L1D hardware prefetchers). These loads may hit or miss L2 cache. Only non rejected loads are counted. ++ 0xe2 extra: all_rfo This event counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. ++ 0xe4 extra: all_code_rd This event counts the total number of L2 code requests. ++ 0xf8 extra: all_pf This event counts the total number of requests from the L2 hardware prefetchers. ++ 0x42 extra: rfo_hit RFO requests that hit L2 cache ++ 0x22 extra: rfo_miss RFO requests that miss L2 cache ++ 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. 
++ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:l1d_pend_miss type:exclusive default:0x1 ++ 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. ++ 0x1 extra: pending_cycles This event counts duration of L1D miss outstanding in cycles. ++name:dtlb_store_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. 
++ 0x20 extra: stlb_hit_4k Store misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks ++name:tx_mem type:exclusive default:0x1 ++ 0x1 extra: abort_conflict Number of times a TSX line had a cache conflict ++ 0x2 extra: abort_capacity_write Number of times a TSX Abort was triggered due to an evicted line caused by a transaction overflow ++ 0x4 extra: abort_hle_store_to_elided_lock Number of times a TSX Abort was triggered due to a non-release/commit store to lock ++ 0x8 extra: abort_hle_elision_buffer_not_empty Number of times a TSX Abort was triggered due to commit but Lock Buffer not empty ++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times a TSX Abort was triggered due to release/commit but data and address mismatch ++ 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times a TSX Abort was triggered due to attempting an unsupported alignment from Lock Buffer ++ 0x40 extra: hle_elision_buffer_full Number of times we could not allocate Lock Buffer ++name:move_elimination type:exclusive default:0x1 ++ 0x1 extra: int_eliminated Number of integer Move Elimination candidate uops that were eliminated. ++ 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. ++ 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. ++ 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. ++name:cpl_cycles type:exclusive default:0x1 ++ 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. ++ 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. 
++ 0x1 extra:edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. ++name:tx_exec type:exclusive default:0x1 ++ 0x1 extra: misc1 Unfriendly TSX abort triggered by a flowmarker ++ 0x2 extra: misc2 Unfriendly TSX abort triggered by a vzeroupper instruction ++ 0x4 extra: misc3 Unfriendly TSX abort triggered by a nest count that is too deep ++ 0x8 extra: misc4 RTM region detected inside HLE ++ 0x10 extra: misc5 # HLE inside HLE+ ++name:rs_events type:exclusive default:0x1 ++ 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. ++ 0x1 extra:inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. ++ 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. ++ 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. 
A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x8 extra: all_data_rd This event counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x1 extra: cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). ++ 0x8 extra: cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++name:lock_cycles type:exclusive default:0x1 ++ 0x1 extra: split_lock_uc_lock_duration This event counts cycles in which the L1 and L2 are locked due to a UC lock or split lock. A lock is asserted in case of locked memory access, due to noncacheable memory, locked operation that spans two cache lines, or a page walk from the noncacheable page table. L1D and L2 locks have a very high performance penalty and it is highly recommended to avoid such access. ++ 0x2 extra: cache_lock_duration This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION). 
++name:idq type:exclusive default:0x2 ++ 0x2 extra: empty This counts the number of cycles that the instruction decoder queue is empty and can indicate that the application may be bound in the front end. It does not determine whether there are uops being delivered to the Alloc stage since uops can be delivered by bypass skipping the Instruction Decode Queue (IDQ) when it is empty. ++ 0x4 extra: mite_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x8 extra: dsb_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra: ms_dsb_uops This event counts the number of uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x20 extra: ms_mite_uops This event counts the number of uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x30 extra: ms_uops This event counts the total number of uops delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops may be initiated by Decode Stream Buffer (DSB) or MITE. ++ 0x30 extra: ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops may be initiated by Decode Stream Buffer (DSB) or MITE.
++ 0x4 extra: mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. ++ 0x8 extra: dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra: ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra:edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra: all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra: all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x24 extra: all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x24 extra: all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). 
++ 0x3c extra: mite_all_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x30 extra:edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++name:itlb_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). ++ 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. ++ 0x10 extra: walk_duration This event counts the number of cycles while PMH is busy with the page walk. ++ 0x20 extra: stlb_hit_4k Core misses that miss the DTLB and hit the STLB (4K) ++ 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks ++name:br_inst_exec type:exclusive default:0xff ++ 0xff extra: all_branches This event counts both taken and not taken speculative and retired branch instructions. ++ 0x41 extra: nontaken_conditional This event counts not taken macro-conditional branch instructions. ++ 0x81 extra: taken_conditional This event counts taken speculative and retired macro-conditional branch instructions. ++ 0x82 extra: taken_direct_jump This event counts taken speculative and retired macro-conditional branch instructions excluding calls and indirect branches. ++ 0x84 extra: taken_indirect_jump_non_call_ret This event counts taken speculative and retired indirect branches excluding calls and return branches. ++ 0x88 extra: taken_indirect_near_return This event counts taken speculative and retired indirect branches that have a return mnemonic. 
++ 0x90 extra: taken_direct_near_call This event counts taken speculative and retired direct near calls. ++ 0xa0 extra: taken_indirect_near_call This event counts taken speculative and retired indirect calls including both register and memory indirect. ++ 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired macro-conditional branch instructions. ++ 0xc2 extra: all_direct_jmp This event counts both taken and not taken speculative and retired macro-unconditional branch instructions, excluding calls and indirects. ++ 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken speculative and retired indirect branches excluding calls and return branches. ++ 0xc8 extra: all_indirect_near_return This event counts both taken and not taken speculative and retired indirect branches that have a return mnemonic. ++ 0xd0 extra: all_direct_near_call This event counts both taken and not taken speculative and retired direct near calls. ++name:br_misp_exec type:exclusive default:0xff ++ 0xff extra: all_branches This event counts both taken and not taken speculative and retired mispredicted branch instructions. ++ 0x41 extra: nontaken_conditional This event counts not taken speculative and retired mispredicted macro conditional branch instructions. ++ 0x81 extra: taken_conditional This event counts taken speculative and retired mispredicted macro conditional branch instructions. ++ 0x84 extra: taken_indirect_jump_non_call_ret This event counts taken speculative and retired mispredicted indirect branches excluding calls and returns. ++ 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired mispredicted macro conditional branch instructions. ++ 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken mispredicted indirect branches excluding calls and returns. 
++ 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls ++name:idq_uops_not_delivered type:exclusive default:0x1 ++ 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding "4 - x" when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops. ++ 0x1 extra: cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. ++ 0x1 extra: cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. ++ 0x1 extra: cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end ++ 0x1 extra: cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end ++ 0x1 extra:inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
++name:uops_executed_port type:exclusive default:0x1 ++ 0x1 extra:any port_0_core Cycles per core when uops are executed in port 0 ++ 0x2 extra:any port_1_core Cycles per core when uops are executed in port 1 ++ 0x4 extra:any port_2_core Cycles per core when uops are dispatched to port 2 ++ 0x8 extra:any port_3_core Cycles per core when uops are dispatched to port 3 ++ 0x10 extra:any port_4_core Cycles per core when uops are executed in port 4 ++ 0x20 extra:any port_5_core Cycles per core when uops are executed in port 5 ++ 0x40 extra:any port_6_core Cycles per core when uops are executed in port 6 ++ 0x80 extra:any port_7_core Cycles per core when uops are dispatched to port 7 ++ 0x1 extra: port_0 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 0. ++ 0x2 extra: port_1 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 1. ++ 0x4 extra: port_2 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 2. ++ 0x8 extra: port_3 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 3. ++ 0x10 extra: port_4 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 4. ++ 0x20 extra: port_5 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 5. ++ 0x40 extra: port_6 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 6. ++ 0x80 extra: port_7 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 7.
++name:uops_dispatched_port type:exclusive default:0x1 ++ 0x1 extra: port_0 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 0. ++ 0x2 extra: port_1 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 1. ++ 0x4 extra: port_2 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 2. ++ 0x8 extra: port_3 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 3. ++ 0x10 extra: port_4 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 4. ++ 0x20 extra: port_5 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 5. ++ 0x40 extra: port_6 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 6. ++ 0x80 extra: port_7 This event counts, on the per-thread basis, cycles during which uops are dispatched from the Reservation Station (RS) to port 7. ++name:resource_stalls type:exclusive default:0x1 ++ 0x1 extra: any This event counts resource-related stall cycles. Reasons for stalls can be as follows: - *any* u-arch structure got full (LB, SB, RS, ROB, BOB, LM, Physical Register Reclaim Table (PRRT), or Physical History Table (PHT) slots) - *any* u-arch structure got empty (like INT/SIMD FreeLists) - FPU control word (FPCW), MXCSR and others. This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x4 extra: rs This event counts stall cycles caused by absence of eligible entries in the reservation station (RS). 
This may result from RS overflow, or from RS deallocation because of the RS array Write Port allocation scheme (each RS entry has two write ports instead of four. As a result, empty entries could not be used, although RS is not really full). This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. ++ 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. ++name:cycle_activity type:exclusive default:0x1 ++ 0x1 extra: cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. ++ 0x8 extra: cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. ++ 0x2 extra: cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) ++ 0x4 extra: cycles_no_execute Counts number of cycles nothing is executed on any execution port. ++ 0x5 extra: stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands ++ 0x6 extra: stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. ++ 0xc extra: stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. ++ 0x8 extra: cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. 
++ 0x1 extra: cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x2 extra: cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x4 extra: stalls_total Total execution stalls. ++ 0xc extra: stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x5 extra: stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x6 extra: stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++name:lsd type:exclusive default:0x1 ++ 0x1 extra: uops Number of Uops delivered by the LSD. Read more on LSD under LSD_REPLAY.REPLAY ++ 0x1 extra: cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra: cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++name:offcore_requests type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd This event counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. ++ 0x2 extra: demand_code_rd This event counts both cacheable and noncachaeble code read requests. ++ 0x4 extra: demand_rfo This event counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. ++ 0x8 extra: all_data_rd This event counts the demand and prefetch data reads. All Core Data Reads include cacheable "Demands" and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. ++name:uops_executed type:exclusive default:0x1 ++ 0x1 extra: thread Number of uops to be executed per-thread each cycle. ++ 0x2 extra: core Number of uops executed from any thread ++ 0x1 extra:inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. 
++ 0x1 extra: cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra: cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra: cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra: cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++name:page_walker_loads type:exclusive default:0x11 ++ 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB ++ 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB ++ 0x12 extra: dtlb_l2 Number of DTLB page walker hits in the L2 ++ 0x22 extra: itlb_l2 Number of ITLB page walker hits in the L2 ++ 0x14 extra: dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP ++ 0x24 extra: itlb_l3 Number of ITLB page walker hits in the L3 + XSNP ++ 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory ++name:inst_retired type:exclusive default:0x2 ++ 0x2 extra: x87 This is a non-precise version (that is, does not use PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling. ++ 0x1 extra: prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. ++name:other_assists type:exclusive default:0x8 ++ 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. ++ 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. ++ 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. 
++name:uops_retired type:exclusive default:0x1 ++ 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. ++ 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. ++ 0x1 extra:inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. ++ 0x1 extra:inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. ++ 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. ++ 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. ++ 0x20 extra: maskmov Maskmov false fault - counts number of time ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. ++ 0x1 extra:edge count Number of machine clears (nukes) of any type. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. ++ 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. 
++ 0x8 extra: near_return This is a non-precise version (that is, does not use PEBS) of the event that counts return instructions retired. ++ 0x10 extra: not_taken This is a non-precise version (that is, does not use PEBS) of the event that counts not taken branch instructions retired. ++ 0x20 extra: near_taken This is a non-precise version (that is, does not use PEBS) of the event that counts taken branch instructions retired. ++ 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. ++ 0x4 extra: all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. ++ 0x4 extra: all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. 
++name:hle_retired type:exclusive default:0x1 ++ 0x1 extra: start Number of times we entered an HLE region; does not count nested transactions ++ 0x2 extra: commit Number of times HLE commit succeeded ++ 0x4 extra: aborted Number of times HLE abort was triggered ++ 0x8 extra: aborted_misc1 Number of times an HLE abort was attributed to a Memory condition (See TSX_Memory event for additional details) ++ 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an HLE abort ++ 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an HLE abort ++ 0x40 extra: aborted_misc4 Number of times HLE caused a fault ++ 0x80 extra: aborted_misc5 Number of times HLE aborted and was not due to the abort conditions in subevents 3-6 ++name:rtm_retired type:exclusive default:0x1 ++ 0x1 extra: start Number of times we entered an RTM region; does not count nested transactions ++ 0x2 extra: commit Number of times RTM commit succeeded ++ 0x4 extra: aborted Number of times RTM abort was triggered ++ 0x8 extra: aborted_misc1 Number of times an RTM abort was attributed to a Memory condition (See TSX_Memory event for additional details) ++ 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an RTM abort ++ 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an RTM abort ++ 0x40 extra: aborted_misc4 Number of times a RTM caused a fault ++ 0x80 extra: aborted_misc5 Number of times RTM aborted and was not due to the abort conditions in subevents 3-6 ++name:fp_assist type:exclusive default:0x1e ++ 0x1e extra: any This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1. ++ 0x2 extra: x87_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid. 
++ 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid. ++ 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. ++ 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. ++name:mem_uops_retired type:exclusive default:0x11 ++ 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x21 extra: lock_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with locked access retired to the architected path. ++ 0x41 extra: split_loads This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted load uops retired to the architected path. 
A line split is across 64B cache-line which includes a page split (4K). ++ 0x42 extra: split_stores This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x81 extra: all_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. ++ 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. ++name:mem_load_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source ++ 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. ++ 0x4 extra: l3_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. 
++ 0x8 extra: l1_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. ++ 0x10 extra: l2_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. ++ 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. ++ 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. ++name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++ 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. ++ 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. ++ 0x4 extra: xsnp_hitm This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). ++ 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. 
++name:l2_trans type:exclusive default:0x80 ++ 0x80 extra: all_requests This event counts transactions that access the L2 pipe including snoops, pagewalks, and so on. ++ 0x1 extra: demand_data_rd This event counts Demand Data Read requests that access L2 cache, including rejects. ++ 0x2 extra: rfo This event counts Read for Ownership (RFO) requests that access L2 cache. ++ 0x4 extra: code_rd This event counts the number of L2 cache accesses when fetching instructions. ++ 0x8 extra: all_pf This event counts L2 or L3 HW prefetches that access L2 cache including rejects. ++ 0x10 extra: l1d_wb This event counts L1D writebacks that access L2 cache. ++ 0x20 extra: l2_fill This event counts L2 fill requests that access L2 cache. ++ 0x40 extra: l2_wb This event counts L2 writebacks that access L2 cache. ++name:l2_lines_in type:exclusive default:0x7 ++ 0x7 extra: all This event counts the number of L2 cache lines filling the L2. Counting does not cover rejects. ++ 0x1 extra: i This event counts the number of L2 cache lines in the Invalidate state filling the L2. Counting does not cover rejects. ++ 0x2 extra: s This event counts the number of L2 cache lines in the Shared state filling the L2. Counting does not cover rejects. ++ 0x4 extra: e This event counts the number of L2 cache lines in the Exclusive state filling the L2. Counting does not cover rejects. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 0cfb4ea..bce230a 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -130,6 +130,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "IBM Power Architected Events V1", "ppc64/architected_events_v1", CPU_PPC64_ARCH_V1, 6 }, + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, + { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, ++ { "Intel Broadwell microarchitecture", "i386/broadwell", CPU_BROADWELL, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -670,6 +671,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_ATOM: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 7c478ad..3754156 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -110,6 +110,7 @@ typedef enum { + CPU_PPC64_ARCH_V1, /** < IBM Power architected events version 1 */ + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ + CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ ++ CPU_BROADWELL, /** < Intel Broadwell (Core-M) microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 968ff04..9c27e6c 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1201,6 +1201,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index e86dcae..1d39692 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -148,8 +148,11 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x3f: + case 0x45: + case 0x46: +- case 0x47: + return CPU_HASWELL; ++ case 
0x3d: ++ case 0x47: ++ case 0x4f: ++ return CPU_BROADWELL; + case 0x37: + case 0x4d: + return CPU_SILVERMONT; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 35f47bc..bf3fbcb 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -555,6 +555,7 @@ int main(int argc, char const * argv[]) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_BROADWELL: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +commit 5ce12ed9d20a91f19cba6e8ecadc478fcd57db6c +Author: Andi Kleen +Date: Thu Jul 17 12:45:09 2014 -0500 + + Fix some problems in the Broadwell events + + Fix some problems in the previous commit of the Broadwell events. + Most flags were missing due to a bug in the generation script. + This patch also re-adds proper PEBS events. + + Signed-off-by: Andi Kleen + +diff --git a/events/i386/broadwell/events b/events/i386/broadwell/events +index 6a4b388..ec55836 100644 +--- a/events/i386/broadwell/events ++++ b/events/i386/broadwell/events +@@ -58,7 +58,7 @@ event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_insert + event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : + event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : + event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : +-event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : ++event:0xd3 counters:0,1,2,3 um:mem_load_uops_l3_miss_retired minimum:100007 name:mem_load_uops_l3_miss_retired : + event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : + event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : + event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 470e9e9..0d6ccd5 100644 +--- a/events/i386/broadwell/unit_masks ++++ 
b/events/i386/broadwell/unit_masks +@@ -36,7 +36,7 @@ name:uops_issued type:exclusive default:0x1 + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. + 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated +- 0x1 extra:inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. ++ 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread. + name:l2_rqsts type:exclusive default:0x21 + 0x21 extra: demand_data_rd_miss This event counts the number of demand Data Read requests that miss L2 cache. Only not rejected loads are counted. + 0x41 extra: demand_data_rd_hit This event counts the number of demand Data Read requests that hit L2 cache. Only not rejected loads are counted. +@@ -56,7 +56,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xff extra: references All L2 requests + name:l1d_pend_miss type:exclusive default:0x1 + 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. 
Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. +- 0x1 extra: pending_cycles This event counts duration of L1D miss outstanding in cycles. ++ 0x1 extra:cmask=1 pending_cycles This event counts duration of L1D miss outstanding in cycles. + name:dtlb_store_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). + 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. +@@ -80,7 +80,7 @@ name:move_elimination type:exclusive default:0x1 + name:cpl_cycles type:exclusive default:0x1 + 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. + 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. +- 0x1 extra:edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. ++ 0x1 extra:cmask=1,edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. + name:tx_exec type:exclusive default:0x1 + 0x1 extra: misc1 Unfriendly TSX abort triggered by a flowmarker + 0x2 extra: misc2 Unfriendly TSX abort triggered by a vzeroupper instruction +@@ -89,14 +89,14 @@ name:tx_exec type:exclusive default:0x1 + 0x10 extra: misc5 # HLE inside HLE+ + name:rs_events type:exclusive default:0x1 + 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. +- 0x1 extra:inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. 
Could be useful to precisely locate Frontend Latency Bound issues. ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. + name:offcore_requests_outstanding type:exclusive default:0x1 + 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. + 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. + 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. + 0x8 extra: all_data_rd This event counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. +- 0x1 extra: cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). 
A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). +- 0x8 extra: cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd This event counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). ++ 0x8 extra:cmask=1 cycles_with_data_rd This event counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. + name:lock_cycles type:exclusive default:0x1 + 0x1 extra: split_lock_uc_lock_duration This event counts cycles in which the L1 and L2 are locked due to a UC lock or split lock. A lock is asserted in case of locked memory access, due to noncacheable memory, locked operation that spans two cache lines, or a page walk from the noncacheable page table. L1D and L2 locks have a very high performance penalty and it is highly recommended to avoid such access. + 0x2 extra: cache_lock_duration This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION). 
+@@ -107,17 +107,17 @@ name:idq type:exclusive default:0x2 + 0x10 extra: ms_dsb_uops This event counts the number of uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. + 0x20 extra: ms_mite_uops This event counts the number of uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. + 0x30 extra: ms_uops This event counts the total number of uops delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. +- 0x30 extra: ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. +- 0x4 extra: mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. +- 0x8 extra: dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x10 extra: ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. +- 0x10 extra:edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. 
+- 0x18 extra: all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x18 extra: all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. +- 0x24 extra: all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). +- 0x24 extra: all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x30 extra:cmask=1 ms_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequenser (MS) is busy. Counting includes uops that may "bypass" the IDQ. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. ++ 0x4 extra:cmask=1 mite_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. ++ 0x8 extra:cmask=1 dsb_cycles This event counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x10 extra:cmask=1 ms_dsb_cycles This event counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. 
++ 0x10 extra:cmask=1,edge ms_dsb_occur This event counts the number of deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while the Microcode Sequencer (MS) is busy. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra:cmask=4 all_dsb_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x18 extra:cmask=1 all_dsb_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may "bypass" the IDQ. ++ 0x24 extra:cmask=4 all_mite_cycles_4_uops This event counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). ++ 0x24 extra:cmask=1 all_mite_cycles_any_uops This event counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). + 0x3c extra: mite_all_uops This event counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may "bypass" the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). 
+- 0x30 extra:edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer + name:itlb_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk This event counts store misses in all DTLB levels that cause page walks of any page size (4K/2M/4M/1G). + 0x2 extra: walk_completed_4k This event counts store misses in all DTLB levels that cause a completed page walk (4K page size). The page walk can end with or without a fault. +@@ -149,11 +149,11 @@ name:br_misp_exec type:exclusive default:0xff + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls + name:idq_uops_not_delivered type:exclusive default:0x1 + 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding ?4 ? x? when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops. +- 0x1 extra: cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. +- 0x1 extra: cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. 
+- 0x1 extra: cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end +- 0x1 extra: cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end +- 0x1 extra:inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. ++ 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end ++ 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end ++ 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. + name:uops_executed_port type:exclusive default:0x1 + 0x1 extra:any port_0_core Cycles per core when uops are exectuted in port 0 + 0x2 extra:any port_1_core Cycles per core when uops are exectuted in port 1 +@@ -186,24 +186,24 @@ name:resource_stalls type:exclusive default:0x1 + 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. + name:cycle_activity type:exclusive default:0x1 +- 0x1 extra: cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. +- 0x8 extra: cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. 
+- 0x2 extra: cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) +- 0x4 extra: cycles_no_execute Counts number of cycles nothing is executed on any execution port. +- 0x5 extra: stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands +- 0x6 extra: stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. +- 0xc extra: stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. +- 0x8 extra: cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. +- 0x1 extra: cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. +- 0x2 extra: cycles_mem_any Cycles while memory subsystem has an outstanding load. +- 0x4 extra: stalls_total Total execution stalls. +- 0xc extra: stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. +- 0x5 extra: stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. +- 0x6 extra: stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++ 0x1 extra:cmask=1 cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. ++ 0x8 extra:cmask=8 cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. 
++ 0x2 extra:cmask=2 cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) ++ 0x4 extra:cmask=4 cycles_no_execute Counts number of cycles nothing is executed on any execution port. ++ 0x5 extra:cmask=5 stalls_l2_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand* load request missing the L2 cache. (as a footprint) * includes also L1 HW prefetch requests that may or may not be required by demands ++ 0x6 extra:cmask=6 stalls_ldm_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request. ++ 0xc extra:cmask=c stalls_l1d_pending Counts number of cycles nothing is executed on any execution port, while there was at least one pending demand load request missing the L1 data cache. ++ 0x8 extra:cmask=8 cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. ++ 0x1 extra:cmask=1 cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x2 extra:cmask=2 cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x4 extra:cmask=4 stalls_total Total execution stalls. ++ 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x6 extra:cmask=6 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. + name:lsd type:exclusive default:0x1 + 0x1 extra: uops Number of Uops delivered by the LSD. 
Read more on LSD under LSD_REPLAY.REPLAY +- 0x1 extra: cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder +- 0x1 extra: cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder + name:offcore_requests type:exclusive default:0x1 + 0x1 extra: demand_data_rd This event counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. + 0x2 extra: demand_code_rd This event counts both cacheable and noncachaeble code read requests. +@@ -212,11 +212,11 @@ name:offcore_requests type:exclusive default:0x1 + name:uops_executed type:exclusive default:0x1 + 0x1 extra: thread Number of uops to be executed per-thread each cycle. + 0x2 extra: core Number of uops executed from any thread +- 0x1 extra:inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. +- 0x1 extra: cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread +- 0x1 extra: cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread +- 0x1 extra: cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread +- 0x1 extra: cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. 
++ 0x1 extra:cmask=1 cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread + name:page_walker_loads type:exclusive default:0x11 + 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB + 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB +@@ -227,38 +227,47 @@ name:page_walker_loads type:exclusive default:0x11 + 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory + name:inst_retired type:exclusive default:0x2 + 0x2 extra: x87 This is a non-precise version (that is, does not use PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling. +- 0x1 extra: prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. ++ 0x1 extra:pebs prec_dist This is a precise version (that is, uses PEBS) of the event that counts instructions retired. + name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. + 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. + name:uops_retired type:exclusive default:0x1 + 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops. 
Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. ++ 0x1 extra: all_pebs Counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. +- 0x1 extra:inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. +- 0x1 extra:inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. ++ 0x2 extra: retire_slots_pebs Counts the number of retirement slots used. ++ 0x1 extra:cmask=1,inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. + name:machine_clears type:exclusive default:0x1 + 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. + 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. + 0x20 extra: maskmov Maskmov false fault - counts number of time ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. +- 0x1 extra:edge count Number of machine clears (nukes) of any type. ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. 
+ name:br_inst_retired type:exclusive default:0x1 + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Counts conditional branch instructions retired. + 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. ++ 0x2 extra: near_call_pebs Counts both direct and indirect near call instructions retired. + 0x8 extra: near_return This is a non-precise version (that is, does not use PEBS) of the event that counts return instructions retired. ++ 0x8 extra: near_return_pebs Counts return instructions retired. + 0x10 extra: not_taken This is a non-precise version (that is, does not use PEBS) of the event that counts not taken branch instructions retired. + 0x20 extra: near_taken This is a non-precise version (that is, does not use PEBS) of the event that counts taken branch instructions retired. ++ 0x20 extra: near_taken_pebs Counts taken branch instructions retired. + 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. +- 0x4 extra: all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. + name:br_misp_retired type:exclusive default:0x1 + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. +- 0x4 extra: all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. ++ 0x1 extra: conditional_pebs Counts mispredicted conditional branch instructions retired. 
++ 0x4 extra:pebs all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. + 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra: near_taken_pebs number of near branch instructions retired that were mispredicted and taken. + name:hle_retired type:exclusive default:0x1 + 0x1 extra: start Number of times we entered an HLE region; does not count nested transactions + 0x2 extra: commit Number of times HLE commit succeeded + 0x4 extra: aborted Number of times HLE abort was triggered ++ 0x4 extra: aborted_pebs Number of times HLE abort was triggered + 0x8 extra: aborted_misc1 Number of times an HLE abort was attributed to a Memory condition (See TSX_Memory event for additional details) + 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an HLE abort + 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an HLE abort +@@ -268,38 +277,60 @@ name:rtm_retired type:exclusive default:0x1 + 0x1 extra: start Number of times we entered an RTM region; does not count nested transactions + 0x2 extra: commit Number of times RTM commit succeeded + 0x4 extra: aborted Number of times RTM abort was triggered ++ 0x4 extra: aborted_pebs Number of times RTM abort was triggered + 0x8 extra: aborted_misc1 Number of times an RTM abort was attributed to a Memory condition (See TSX_Memory event for additional details) + 0x10 extra: aborted_misc2 Number of times the TSX watchdog signaled an RTM abort + 0x20 extra: aborted_misc3 Number of times a disallowed operation caused an RTM abort + 0x40 extra: aborted_misc4 Number of times a RTM caused a fault + 0x80 extra: aborted_misc5 Number of times RTM aborted and was not due to the abort conditions in subevents 3-6 + name:fp_assist type:exclusive default:0x1e +- 0x1e extra: any This event counts cycles with any input and output SSE or x87 FP assist. 
If an input and output assist are detected on the same cycle the event increments by 1. ++ 0x1e extra:cmask=1 any This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1. + 0x2 extra: x87_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid. + 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid. + 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. + 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. + name:mem_uops_retired type:exclusive default:0x11 + 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x11 extra: stlb_miss_loads_pebs Counts load uops with true STLB miss retired to the architected path. 
True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. ++ 0x12 extra: stlb_miss_stores_pebs Counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x21 extra: lock_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with locked access retired to the architected path. ++ 0x21 extra: lock_loads_pebs Counts load uops with locked access retired to the architected path. + 0x41 extra: split_loads This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x41 extra: split_loads_pebs Counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). + 0x42 extra: split_stores This is a non-precise version (that is, does not use PEBS) of the event that counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). ++ 0x42 extra: split_stores_pebs Counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). 
+ 0x81 extra: all_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. ++ 0x81 extra: all_loads_pebs Counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. + 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. ++ 0x82 extra: all_stores_pebs Counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. + name:mem_load_uops_retired type:exclusive default:0x1 + 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source ++ 0x1 extra: l1_hit_pebs Counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load.
This event also counts SW prefetches independent of the actual data source + 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. ++ 0x2 extra: l2_hit_pebs Counts retired load uops which data sources were hits in the mid-level (L2) cache. + 0x4 extra: l3_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. ++ 0x4 extra: l3_hit_pebs Counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required. + 0x8 extra: l1_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. ++ 0x8 extra: l1_miss_pebs Counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source. + 0x10 extra: l2_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. ++ 0x10 extra: l2_miss_pebs Counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source. + 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. ++ 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. + 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. 
Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. ++ 0x40 extra: hit_lfb_pebs Counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. + name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. ++ 0x1 extra: xsnp_miss_pebs Counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. ++ 0x2 extra: xsnp_hit_pebs Counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. + 0x4 extra: xsnp_hitm This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). ++ 0x4 extra: xsnp_hitm_pebs Counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). + 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. 
++ 0x8 extra: xsnp_none_pebs Counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. ++name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++ 0x1 extra: local_dram Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) ++ 0x1 extra: local_dram_pebs Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + name:l2_trans type:exclusive default:0x80 + 0x80 extra: all_requests This event counts transactions that access the L2 pipe including snoops, pagewalks, and so on. + 0x1 extra: demand_data_rd This event counts Demand Data Read requests that access L2 cache, including rejects. +commit 893c18c2a2ba955bc77140bbd7696cc2d3f6e1dc +Author: Andi Kleen +Date: Thu Jul 17 12:55:42 2014 -0500 + + Improve error message for non-unique unit mask + + For the case where the user does not specify a UM and the default UM + is a non-unique hex value, the error message printed is the following: + + Default unit mask not supported for this event. + Please specify a unit mask by name, using the first word of the unit mask description. + + For cases where the user wrongly specifies a non-unique hex value for a UM + when they should have specified it by name, the message will be like the + following example: + + Unit mask (0x1) is non unique. + Please specify a unit mask by name, using the first word of the unit mask description. 
+ + Signed-off-by: Andi Kleen + +diff --git a/libop/op_events.c b/libop/op_events.c +index 9c27e6c..b8900a5 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1389,6 +1389,7 @@ static void do_resolve_unit_mask(struct op_event *e, + if (pe->unit_mask_name == NULL) { + /* For numerical unit mask */ + int found = 0; ++ int old_um_valid = pe->unit_mask_valid; + + /* Use default unitmask if not specified */ + if (!pe->unit_mask_valid) { +@@ -1404,9 +1405,16 @@ static void do_resolve_unit_mask(struct op_event *e, + found++; + } + if (found > 1) { +- fprintf(stderr, "Unit mask (0x%x) is non unique.\n" +- "Please specify the unit mask using the first " +- "word of the description\n", ++ if (!old_um_valid) ++ fprintf(stderr, ++ "Default unit mask not supported for this event.\n" ++ "Please specify a unit mask by name, using the first " ++ "word of the unit mask description\n"); ++ else ++ fprintf(stderr, ++ "Unit mask (0x%x) is non unique.\n" ++ "Please specify the unit mask using the first " ++ "word of the description\n", + pe->unit_mask); + exit(EXIT_FAILURE); + } diff --git a/SOURCES/oprofile-haswell.patch b/SOURCES/oprofile-haswell.patch new file mode 100644 index 0000000..6c02359 --- /dev/null +++ b/SOURCES/oprofile-haswell.patch @@ -0,0 +1,570 @@ +commit 5f11ddb982931f754d3319a64313cf880424ea73 +Author: Andi Kleen +Date: Thu Jul 17 16:23:38 2014 -0500 + + Update the Haswell events to the latest version + + Some minor changes to the previous version, but it should be more + consistent with other tools now. + + The event name descriptions have been dropped. They were never all that + useful anyways because the event is defined by the unit masks. + Now all events with more than one unit mask only have a description + in the unit masks. + + As a new feature any known Errata to the event are referenced.
+ + Signed-off-by: Andi Kleen + +diff --git a/events/i386/haswell/events b/events/i386/haswell/events +index 51fcd50..5aa5eb5 100644 +--- a/events/i386/haswell/events ++++ b/events/i386/haswell/events +@@ -7,54 +7,58 @@ + # lowered in many cases without ill effect. + # + include:i386/arch_perfmon +-event:0x03 counters:cpuid um:x02 minimum:100003 name:ld_blocks_store_forward : Cases when loads get true Block-on-Store blocking code preventing store forwarding +-event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : misalign_mem_ref +-event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : False dependencies in MOB due to partial address comparison +-event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : dtlb_load_misses +-event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : Number of cycles waiting for the checkpoints in Resource Allocation Table (RAT) to be recovered after Nuke due to all other cases except JEClear (e.g. whenever a ucode assist is needed like SSE exception, memory disambiguation, etc...) 
+-event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : uops_issued +-event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : l2_rqsts +-event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : Not rejected writebacks that hit L2 cache +-event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : l1d_pend_miss +-event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : dtlb_store_misses +-event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre : load_hit_pre +-event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : L1D data line replacements +-event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : tx_mem +-event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : move_elimination +-event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : cpl_cycles +-event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : tx_exec +-event:0x5e counters:cpuid um:one minimum:2000003 name:rs_events_empty_cycles : Cycles when Reservation Station (RS) is empty for the thread +-event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : lock_cycles +-event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : idq +-event:0x80 counters:cpuid um:x02 minimum:200003 name:icache_misses : Number of Instruction Cache, Streaming Buffer and Victim Cache Misses. Includes Uncacheable accesses. 
+-event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : itlb_misses +-event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall : ild_stall +-event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : br_inst_exec +-event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : br_misp_exec +-event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : idq_uops_not_delivered +-event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : uops_executed_port +-event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : resource_stalls +-event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : cycle_activity +-event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages. +-event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : offcore_requests +-event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : uops_executed +-event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : page_walker_loads +-event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : tlb_flush +-event:0xc0 counters:1 um:one minimum:2000003 name:inst_retired_prec_dist : Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution +-event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : other_assists +-event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : uops_retired +-event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears : machine_clears +-event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : br_inst_retired +-event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : br_misp_retired +-event:0xc8 
counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : hle_retired +-event:0xc9 counters:cpuid um:rtm_retired minimum:2000003 name:rtm_retired : rtm_retired +-event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : fp_assist +-event:0xcc counters:cpuid um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : Count cases of saving new LBR +-event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : mem_uops_retired +-event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : mem_load_uops_retired +-event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : mem_load_uops_l3_hit_retired +-event:0xd3 counters:0,1,2,3 um:one minimum:100007 name:mem_load_uops_l3_miss_retired_local_dram : Data from local DRAM either Snoop not needed or Snoop Miss (RspI) +-event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end. 
+-event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : l2_trans +-event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : l2_lines_in +-event:0xf2 counters:cpuid um:l2_lines_out minimum:100003 name:l2_lines_out : l2_lines_out ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000003 name:misalign_mem_ref : ++event:0x07 counters:cpuid um:one minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:x03 minimum:2000003 name:int_misc_recovery_cycles : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x27 counters:cpuid um:x50 minimum:200003 name:l2_demand_rqsts_wb_hit : ++event:0x48 counters:2 um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:100003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre : ++event:0x4f counters:cpuid um:x10 minimum:2000003 name:ept_walk_cycles : ++event:0x51 counters:cpuid um:one minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x58 counters:cpuid um:move_elimination minimum:1000003 name:move_elimination : ++event:0x5c counters:cpuid um:cpl_cycles minimum:2000003 name:cpl_cycles : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles : ++event:0x79 counters:0,1,2,3 um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:icache minimum:2000003 
name:icache : ++event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall : ++event:0x88 counters:cpuid um:br_inst_exec minimum:200003 name:br_inst_exec : ++event:0x89 counters:cpuid um:br_misp_exec minimum:200003 name:br_misp_exec : ++event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_executed_port minimum:2000003 name:uops_executed_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:2 um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa8 counters:cpuid um:one minimum:2000003 name:lsd_uops : ++event:0xab counters:cpuid um:x02 minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:one minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xbc counters:0,1,2,3 um:page_walker_loads minimum:2000003 name:page_walker_loads : ++event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : ++event:0xc0 counters:1 um:one minimum:2000003 name:inst_retired_prec_dist : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:2000003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:0,1,2,3 um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist : ++event:0xcc counters:cpuid 
um:x20 minimum:2000003 name:rob_misc_events_lbr_inserts : ++event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000003 name:mem_uops_retired : ++event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000003 name:mem_load_uops_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_uops_l3_hit_retired minimum:100003 name:mem_load_uops_l3_hit_retired : ++event:0xd3 counters:0,1,2,3 um:mem_load_uops_l3_miss_retired minimum:100007 name:mem_load_uops_l3_miss_retired : ++event:0xe6 counters:cpuid um:x1f minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in : ++event:0xf2 counters:cpuid um:l2_lines_out minimum:100003 name:l2_lines_out : +diff --git a/events/i386/haswell/unit_masks b/events/i386/haswell/unit_masks +index 32e1c1e..60c2a61 100644 +--- a/events/i386/haswell/unit_masks ++++ b/events/i386/haswell/unit_masks +@@ -8,27 +8,32 @@ name:x02 type:mandatory default:0x2 + 0x2 No unit mask + name:x03 type:mandatory default:0x3 + 0x3 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask + name:x1f type:mandatory default:0x1f + 0x1f No unit mask + name:x20 type:mandatory default:0x20 + 0x20 No unit mask + name:x50 type:mandatory default:0x50 + 0x50 No unit mask ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load. The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceding smaller uncompleted store. The penalty for blocked store forwarding is that the load must wait for the store to write its value to the cache before it can be issued. 
++ 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use + name:misalign_mem_ref type:exclusive default:0x1 + 0x1 extra: loads Speculative cache line split load uops dispatched to L1 cache + 0x2 extra: stores Speculative cache line split STA uops dispatched to L1 cache + name:dtlb_load_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Load misses in all DTLB levels that cause page walks +- 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x2 extra: walk_completed_4k Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (4K). + 0x4 extra: walk_completed_2m_4m Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (2M/4M). +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +- 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) +- 0x40 extra: stlb_hit_2m Load misses that miss the DTLB and hit the STLB (2M) ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by DTLB load misses. ++ 0x20 extra: stlb_hit_4k This event counts load operations from a 4K page that miss the first DTLB level but hit the second and do not cause page walks. ++ 0x40 extra: stlb_hit_2m This event counts load operations from a 2M page that miss the first DTLB level but hit the second and do not cause page walks. 
+ 0x80 extra: pde_cache_miss DTLB demand load misses with low part of linear-to-physical address translation missed +-name:uops_issued type:exclusive default:any +- 0x1 extra: any Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS) ++ 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. ++ 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks ++name:uops_issued type:exclusive default:0x1 ++ 0x1 extra: any This event counts the number of uops issued by the Front-end of the pipeline to the Back-end. This event is counted at the allocation stage and will count both retired and non-retired uops. + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. + 0x40 extra: single_mul Number of Multiply packed/scalar single precision uops allocated +@@ -47,49 +52,59 @@ name:l2_rqsts type:exclusive default:0x21 + 0x22 extra: rfo_miss RFO requests that miss L2 cache + 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. 
+ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions +- 0x27 extra: all_demand_miss Demand requests that miss L2 cache +- 0xe7 extra: all_demand_references Demand requests to L2 cache +- 0x3f extra: miss All requests that miss L2 cache +- 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:pending ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:l1d_pend_miss type:exclusive default:0x1 + 0x1 extra: pending L1D miss oustandings duration in cycles + 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. +- 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding, using an edge detect to count transitions. + name:dtlb_store_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Store misses in all DTLB levels that cause page walks +- 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks +- 0x2 extra: walk_completed_4k Store miss in all TLB levels causes a page walk that completes. (4K) ++ 0x2 extra: walk_completed_4k Store miss in all TLB levels causes a page walk that completes. (4K) + 0x4 extra: walk_completed_2m_4m Store misses in all DTLB levels that cause completed page walks (2M/4M) +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks +- 0x20 extra: stlb_hit_4k Store misses that miss the DTLB and hit the STLB (4K) +- 0x40 extra: stlb_hit_2m Store misses that miss the DTLB and hit the STLB (2M) ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by DTLB store misses. 
++ 0x20 extra: stlb_hit_4k This event counts store operations from a 4K page that miss the first DTLB level but hit the second and do not cause page walks. ++ 0x40 extra: stlb_hit_2m This event counts store operations from a 2M page that miss the first DTLB level but hit the second and do not cause page walks. + 0x80 extra: pde_cache_miss DTLB store misses with low part of linear-to-physical address translation missed ++ 0xe extra: walk_completed Store misses in all DTLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Store operations that miss the first TLB level but hit the second and do not cause page walks + name:load_hit_pre type:exclusive default:0x1 + 0x1 extra: sw_pf Not software-prefetch load dispatches that hit FB allocated for software prefetch + 0x2 extra: hw_pf Not software-prefetch load dispatches that hit FB allocated for hardware prefetch + name:tx_mem type:exclusive default:0x1 + 0x1 extra: abort_conflict Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address +- 0x2 extra: abort_capacity Number of times a transactional abort was signaled due to a data capacity limitation ++ 0x2 extra: abort_capacity_write Number of times a transactional abort was signaled due to a data capacity limitation for transactional writes. + 0x4 extra: abort_hle_store_to_elided_lock Number of times a HLE transactional region aborted due to a non XRELEASE prefixed instruction writing to an elided lock in the elision buffer + 0x8 extra: abort_hle_elision_buffer_not_empty Number of times an HLE transactional execution aborted due to NoAllocatedElisionBuffer being non-zero. +- 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer. 
++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer + 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times an HLE transactional execution aborted due to an unsupported read alignment from the elision buffer. +- 0x40 extra: abort_hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. ++ 0x40 extra: hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. + name:move_elimination type:exclusive default:0x1 + 0x1 extra: int_eliminated Number of integer Move Elimination candidate uops that were eliminated. + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. +-name:cpl_cycles type:exclusive default:ring0 ++name:cpl_cycles type:exclusive default:0x1 + 0x1 extra: ring0 Unhalted core cycles when the thread is in ring 0 + 0x2 extra: ring123 Unhalted core cycles when thread is in rings 1, 2, or 3 + 0x1 extra:cmask=1,edge ring0_trans Number of intervals between processor halts while thread is in ring 0 + name:tx_exec type:exclusive default:0x1 +- 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution it may not always cause a transactional abort. 
+- 0x2 extra: misc2 Counts the number of times a class of instructions that may cause a transactional abort was executed inside a transactional region +- 0x4 extra: misc3 Counts the number of times an instruction execution caused the nest count supported to be exceeded +- 0x8 extra: misc4 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++ 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort. ++ 0x2 extra: misc2 Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region ++ 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded ++ 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. ++ 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++name:rs_events type:exclusive default:0x1 ++ 0x1 extra: empty_cycles This event counts cycles when the Reservation Station ( RS ) is empty for the thread. The RS is a structure that buffers allocated micro-ops from the Front-end. If there are many cycles when the RS is empty, it may represent an underflow of instructions delivered from the Front-end. ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. 
++ 0x2 extra: demand_code_rd Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x4 extra: demand_rfo Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore ++ 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x8 extra:cmask=1 cycles_with_data_rd Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore + name:lock_cycles type:exclusive default:0x1 + 0x1 extra: split_lock_uc_lock_duration Cycles when L1 and L2 are locked due to UC or split lock + 0x2 extra: cache_lock_duration Cycles when L1D is locked +@@ -99,8 +114,8 @@ name:idq type:exclusive default:0x2 + 0x8 extra: dsb_uops Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path + 0x10 extra: ms_dsb_uops Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy + 0x20 extra: ms_mite_uops Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +- 0x30 extra: ms_uops Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +- 0x30 extra:cmask=1 ms_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++ 0x30 extra: ms_uops This event counts uops delivered by the Front-end with the assistance of the microcode sequencer. Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder. Using other instructions, if possible, will usually improve performance. 
++ 0x30 extra:cmask=1 ms_cycles This event counts cycles during which the microcode sequencer assisted the Front-end in delivering uops. Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder. Using other instructions, if possible, will usually improve performance. + 0x4 extra:cmask=1 mite_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path + 0x8 extra:cmask=1 dsb_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path + 0x10 extra:cmask=1 ms_dsb_cycles Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy +@@ -110,17 +125,21 @@ name:idq type:exclusive default:0x2 + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering any Uop + 0x3c extra: mite_all_uops Uops delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++name:icache type:exclusive default:0x2 ++ 0x2 extra: misses This event counts Instruction Cache (ICACHE) misses. ++ 0x4 extra: ifetch_stall Cycles where a code-fetch stalled due to L1 instruction-cache miss or an iTLB miss + name:itlb_misses type:exclusive default:0x1 + 0x1 extra: miss_causes_a_walk Misses at all ITLB levels that cause page walks +- 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks + 0x2 extra: walk_completed_4k Code miss in all TLB levels causes a page walk that completes. (4K) + 0x4 extra: walk_completed_2m_4m Code miss in all TLB levels causes a page walk that completes. 
(2M/4M) +- 0x10 extra: walk_duration Cycles when PMH is busy with page walks +- 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks ++ 0x10 extra: walk_duration This event counts cycles when the page miss handler (PMH) is servicing page walks caused by ITLB misses. + 0x20 extra: stlb_hit_4k Core misses that miss the DTLB and hit the STLB (4K) + 0x40 extra: stlb_hit_2m Code misses that miss the DTLB and hit the STLB (2M) ++ 0xe extra: walk_completed Misses in all ITLB levels that cause completed page walks ++ 0x60 extra: stlb_hit Operations that miss the first ITLB level but hit the second and do not cause any page walks + name:ild_stall type:exclusive default:0x1 +- 0x1 extra: lcp Stalls caused by changing prefix length of the instruction. ++ 0x1 extra: lcp This event counts cycles where the decoder is stalled on an instruction with a length changing prefix (LCP). + 0x4 extra: iq_full Stall cycles because IQ is full + name:br_inst_exec type:exclusive default:0xff + 0xff extra: all_branches Speculative and retired branches +@@ -145,14 +164,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional Speculative and retired mispredicted macro conditional branches + 0xc4 extra: all_indirect_jump_non_call_ret Mispredicted indirect branches excluding calls and returns + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:core +- 0x1 extra: core Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled +- 0x1 extra:cmask=4 cycles_0_uops_deliv_core Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled +- 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is 
not stalled ++name:idq_uops_not_delivered type:exclusive default:0x1 ++ 0x1 extra: core This event counts the number of undelivered (unallocated) uops from the Front-end to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. The Front-end can allocate up to 4 uops per cycle so this event can increment 0-4 times per cycle depending on the number of unallocated uops. This event is counted on a per-core basis. ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts the number of cycles during which the Front-end allocated exactly zero uops to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. This event is counted on a per-core basis. ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +-name:uops_executed_port type:exclusive default:port_0 ++name:uops_executed_port type:exclusive default:0x1 + 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 + 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 + 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 +@@ -172,88 +191,100 @@ name:uops_executed_port type:exclusive default:port_0 + name:resource_stalls type:exclusive default:0x1 + 0x1 extra: any Resource-related stall cycles + 0x4 extra: rs Cycles stalled due to no eligible RS entry available. +- 0x8 extra: sb Cycles stalled due to no store buffers available. (not including draining form sync).
++ 0x8 extra: sb This event counts cycles during which no instructions were allocated because no Store Buffers (SB) were available. + 0x10 extra: rob Cycles stalled due to re-order buffer full. +-name:cycle_activity type:exclusive default:0x8 ++name:cycle_activity type:exclusive default:0x1 ++ 0x1 extra:cmask=1 cycles_l2_pending Cycles with pending L2 cache miss loads. + 0x8 extra:cmask=8 cycles_l1d_pending Cycles with pending L1 cache miss loads. + 0x2 extra:cmask=2 cycles_ldm_pending Cycles with pending memory loads. +- 0x4 extra:cmask=4 cycles_no_execute Total execution stalls +- 0x6 extra:cmask=6 stalls_ldm_pending Execution stalls due to memory subsystem. +-name:offcore_requests type:exclusive default:0x2 ++ 0x4 extra:cmask=4 cycles_no_execute This event counts cycles during which no instructions were executed in the execution stage of the pipeline. ++ 0x5 extra:cmask=5 stalls_l2_pending Execution stalls due to L2 cache misses. ++ 0x6 extra:cmask=6 stalls_ldm_pending This event counts cycles during which no instructions were executed in the execution stage of the pipeline and there were memory instructions pending (waiting for data). ++ 0xc extra:cmask=c stalls_l1d_pending Execution stalls due to L1 data cache misses ++name:offcore_requests type:exclusive default:0x1 ++ 0x1 extra: demand_data_rd Demand Data Read requests sent to uncore + 0x2 extra: demand_code_rd Cacheable and noncachaeble code read requests + 0x4 extra: demand_rfo Demand RFO requests including regular RFOs, locks, ItoM + 0x8 extra: all_data_rd Demand and prefetch data reads +-name:uops_executed type:exclusive default:thread +- 0x1 extra: thread Counts the number of uops to be executed per-thread each cycle. +- 0x2 extra: core Number of uops executed on the core. ++name:uops_executed type:exclusive default:0x2 ++ 0x2 extra: core Number of uops executed on the core. Errata: HSM31 + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread. 
+- 0x1 extra:cmask=1,inv cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread +- 0x1 extra:cmask=1,inv cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x1 extra:cmask=1 cycles_ge_1_uops_exec This event counts the cycles where at least one uop was executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec This event counts the cycles where at least two uops were executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec This event counts the cycles where at least three uops were executed. It is counted per thread. Errata: HSM31 ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread Errata: HSM31 + name:page_walker_loads type:exclusive default:0x11 +- 0x11 extra: ia32_dtlb_l1 Number of DTLB page walker hits in the L1+FB +- 0x21 extra: ia32_itlb_l1 Number of ITLB page walker hits in the L1+FB +- 0x12 extra: ia32_dtlb_l2 Number of DTLB page walker hits in the L2 +- 0x22 extra: ia32_itlb_l2 Number of ITLB page walker hits in the L2 +- 0x14 extra: ia32_dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP +- 0x24 extra: ia32_itlb_l3 Number of ITLB page walker hits in the L3 + XSNP +- 0x18 extra: ia32_dtlb_memory Number of DTLB page walker hits in Memory +- 0x28 extra: ia32_itlb_memory Number of ITLB page walker hits in Memory ++ 0x11 extra: dtlb_l1 Number of DTLB page walker hits in the L1+FB ++ 0x21 extra: itlb_l1 Number of ITLB page walker hits in the L1+FB ++ 0x41 extra: ept_dtlb_l1 Counts the number of Extended Page Table walks from the DTLB that hit in the L1 and FB. ++ 0x81 extra: ept_itlb_l1 Counts the number of Extended Page Table walks from the ITLB that hit in the L1 and FB.
++ 0x12 extra: dtlb_l2 Number of DTLB page walker hits in the L2 ++ 0x22 extra: itlb_l2 Number of ITLB page walker hits in the L2 ++ 0x42 extra: ept_dtlb_l2 Counts the number of Extended Page Table walks from the DTLB that hit in the L2. ++ 0x82 extra: ept_itlb_l2 Counts the number of Extended Page Table walks from the ITLB that hit in the L2. ++ 0x14 extra: dtlb_l3 Number of DTLB page walker hits in the L3 + XSNP ++ 0x24 extra: itlb_l3 Number of ITLB page walker hits in the L3 + XSNP ++ 0x44 extra: ept_dtlb_l3 Counts the number of Extended Page Table walks from the DTLB that hit in the L3. ++ 0x84 extra: ept_itlb_l3 Counts the number of Extended Page Table walks from the ITLB that hit in the L3. ++ 0x18 extra: dtlb_memory Number of DTLB page walker hits in Memory ++ 0x48 extra: ept_dtlb_memory Counts the number of Extended Page Table walks from the DTLB that hit in memory. ++ 0x88 extra: ept_itlb_memory Counts the number of Extended Page Table walks from the ITLB that hit in memory. + name:tlb_flush type:exclusive default:0x1 + 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries + 0x20 extra: stlb_any STLB flush attempts + name:other_assists type:exclusive default:0x8 +- 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. +- 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. ++ 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. Errata: HSM57 ++ 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. Errata: HSM57 + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:all +- 0x1 extra: all Actually retired uops. +- 0x2 extra: retire_slots Retirement slots used. +- 0x1 extra:pebs all_ps Actually retired uops. (Precise Event - PEBS) +- 0x2 extra:pebs retire_slots_ps Retirement slots used. 
(Precise Event - PEBS) +- 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. +- 0x1 extra:cmask=10,inv total_cycles Cycles with less than 10 actually retired uops. +- 0x1 extra:cmask=1,inv,any core_stall_cycles Cycles without actually retired uops. +-name:machine_clears type:exclusive default:0x2 +- 0x2 extra: memory_ordering Counts the number of machine clears due to memory order conflicts. +- 0x4 extra: smc Self-modifying code (SMC) detected. +- 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. +-name:br_inst_retired type:exclusive default:all_branches_ps +- 0x1 extra: conditional Conditional branch instructions retired. +- 0x2 extra: near_call Direct and indirect near call instructions retired. +- 0x8 extra: near_return Return instructions retired. +- 0x10 extra: not_taken Not taken branch instructions retired. +- 0x20 extra: near_taken Taken branch instructions retired. +- 0x40 extra: far_branch Far branch instructions retired. +- 0x1 extra:pebs conditional_ps Conditional branch instructions retired. (Precise Event - PEBS) +- 0x2 extra:pebs near_call_ps Direct and indirect near call instructions retired. (Precise Event - PEBS) +- 0x4 extra:pebs all_branches_ps All (macro) branch instructions retired. (Precise Event - PEBS) +- 0x8 extra:pebs near_return_ps Return instructions retired. (Precise Event - PEBS) +- 0x20 extra:pebs near_taken_ps Taken branch instructions retired. (Precise Event - PEBS) +- 0x2 extra: near_call_r3 Direct and indirect macro near call instructions retired (captured in ring 3). +- 0x2 extra:pebs near_call_r3_ps Direct and indirect macro near call instructions retired (captured in ring 3). (Precise Event - PEBS) +-name:br_misp_retired type:exclusive default:all_branches_ps +- 0x1 extra: conditional Mispredicted conditional branch instructions retired. 
+- 0x1 extra:pebs conditional_ps Mispredicted conditional branch instructions retired. (Precise Event - PEBS) +- 0x4 extra:pebs all_branches_ps Mispredicted macro branch instructions retired. (Precise Event - PEBS) +- 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. +- 0x20 extra:pebs near_taken_ps number of near branch instructions retired that were mispredicted and taken. (Precise Event - PEBS) ++name:uops_retired type:exclusive default:0x1 ++ 0x1 extra: all Actually retired uops. ++ 0x1 extra: all_pebs Actually retired uops. ++ 0x2 extra: retire_slots This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. ++ 0x2 extra: retire_slots_pebs This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. ++ 0x1 extra:cmask=1,inv core_stall_cycles Cycles without actually retired uops. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra: cycles Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes. ++ 0x2 extra: memory_ordering This event counts the number of memory ordering machine clears detected. Memory ordering machine clears can result from memory address aliasing or snoops from another hardware thread or core to data inflight in the pipeline. Machine clears can have a significant performance impact if they are happening frequently. ++ 0x4 extra: smc This event is incremented when self-modifying code (SMC) is detected, which causes a machine clear. Machine clears can have a significant performance impact if they are happening frequently. 
++ 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: conditional Conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Conditional branch instructions retired. ++ 0x2 extra: near_call Direct and indirect near call instructions retired. ++ 0x2 extra: near_call_pebs Direct and indirect near call instructions retired. ++ 0x8 extra: near_return Return instructions retired. ++ 0x8 extra: near_return_pebs Return instructions retired. ++ 0x10 extra: not_taken Not taken branch instructions retired. ++ 0x20 extra: near_taken Taken branch instructions retired. ++ 0x20 extra: near_taken_pebs Taken branch instructions retired. ++ 0x40 extra: far_branch Far branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: conditional Mispredicted conditional branch instructions retired. ++ 0x1 extra: conditional_pebs Mispredicted conditional branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs This event counts all mispredicted branch instructions retired. This is a precise event. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra: near_taken_pebs number of near branch instructions retired that were mispredicted and taken. + name:hle_retired type:exclusive default:0x1 + 0x1 extra: start Number of times an HLE execution started. 
+ 0x2 extra: commit Number of times an HLE execution successfully committed +- 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one) +- 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to 1 various memory events ++ 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ++ 0x4 extra: aborted_pebs Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ++ 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts). + 0x10 extra: aborted_misc2 Number of times an HLE execution aborted due to uncommon conditions + 0x20 extra: aborted_misc3 Number of times an HLE execution aborted due to HLE-unfriendly instructions + 0x40 extra: aborted_misc4 Number of times an HLE execution aborted due to incompatible memory type +- 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to none of the previous categories (e.g. interrupt) ++ 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to none of the previous 4 categories (e.g. interrupts) + name:rtm_retired type:exclusive default:0x1 + 0x1 extra: start Number of times an RTM execution started. + 0x2 extra: commit Number of times an RTM execution successfully committed +- 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one) +- 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events +- 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to uncommon conditions ++ 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). ++ 0x4 extra: aborted_pebs Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). 
++ 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts) ++ 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to uncommon conditions + 0x20 extra: aborted_misc3 Number of times an RTM execution aborted due to HLE-unfriendly instructions + 0x40 extra: aborted_misc4 Number of times an RTM execution aborted due to incompatible memory type + 0x80 extra: aborted_misc5 Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt) +@@ -263,51 +294,59 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input Number of X87 assists due to input value. + 0x8 extra: simd_output Number of SIMD FP assists due to Output values + 0x10 extra: simd_input Number of SIMD FP assists due to input values +-name:mem_uops_retired type:exclusive default:all_loads +- 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. +- 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. +- 0x21 extra: lock_loads Load uops with locked access retired to architected path. +- 0x41 extra: split_loads Line-splitted load uops retired to architected path. +- 0x42 extra: split_stores Line-splitted store uops retired to architected path. +- 0x81 extra: all_loads Load uops retired to architected path with filter on bits 0 and 1 applied. +- 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. +- 0x11 extra:pebs stlb_miss_loads_ps Load uops with true STLB miss retired to architected path. (Precise Event - PEBS) +- 0x12 extra:pebs stlb_miss_stores_ps Store uops true STLB miss retired to architected path. (Precise Event - PEBS) +- 0x21 extra:pebs lock_loads_ps Load uops with locked access retired to architected path. 
(Precise Event - PEBS) +- 0x41 extra:pebs split_loads_ps Line-splitted load uops retired to architected path. (Precise Event - PEBS) +- 0x42 extra:pebs split_stores_ps Line-splitted store uops retired to architected path. (Precise Event - PEBS) +- 0x81 extra:pebs all_loads_ps Load uops retired to architected path with filter on bits 0 and 1 applied. (Precise Event - PEBS) +- 0x82 extra:pebs all_stores_ps Store uops retired to architected path with filter on bits 0 and 1 applied. (Precise Event - PEBS) +-name:mem_load_uops_retired type:exclusive default:l1_hit +- 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. +- 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. +- 0x4 extra: l3_hit Retired load uops which data sources were data hits in LLC without snoops required. +- 0x10 extra: l2_miss Miss in mid-level (L2) cache. Excludes Unknown data-source. +- 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. +- 0x1 extra:pebs l1_hit_ps Retired load uops with L1 cache hits as data sources. (Precise Event - PEBS) +- 0x2 extra:pebs l2_hit_ps Retired load uops with L2 cache hits as data sources. (Precise Event - PEBS) +- 0x4 extra:pebs l3_hit_ps Miss in last-level (L3) cache. Excludes Unknown data-source. (Precise Event - PEBS) +- 0x40 extra:pebs hit_lfb_ps Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. (Precise Event - PEBS) +-name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss +- 0x1 extra: xsnp_miss Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache. +- 0x2 extra: xsnp_hit Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache. +- 0x4 extra: xsnp_hitm Retired load uops which data sources were HitM responses from shared LLC. 
+- 0x8 extra: xsnp_none Retired load uops which data sources were hits in LLC without snoops required. +- 0x1 extra:pebs xsnp_miss_ps Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache. (Precise Event - PEBS) +- 0x2 extra:pebs xsnp_hit_ps Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache. (Precise Event - PEBS) +- 0x4 extra:pebs xsnp_hitm_ps Retired load uops which data sources were HitM responses from shared LLC. (Precise Event - PEBS) +- 0x8 extra:pebs xsnp_none_ps Retired load uops which data sources were hits in LLC without snoops required. (Precise Event - PEBS) ++name:mem_uops_retired type:exclusive default:0x11 ++ 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x11 extra: stlb_miss_loads_pebs Load uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x12 extra: stlb_miss_stores_pebs Store uops with true STLB miss retired to architected path. Errata: HSM30 ++ 0x21 extra: lock_loads Load uops with locked access retired to architected path. Errata: HSM30 ++ 0x21 extra: lock_loads_pebs Load uops with locked access retired to architected path. Errata: HSM30 ++ 0x41 extra: split_loads Line-splitted load uops retired to architected path. Errata: HSM30 ++ 0x41 extra: split_loads_pebs Line-splitted load uops retired to architected path. Errata: HSM30 ++ 0x42 extra: split_stores Line-splitted store uops retired to architected path. Errata: HSM30 ++ 0x42 extra: split_stores_pebs Line-splitted store uops retired to architected path. Errata: HSM30 ++ 0x81 extra: all_loads Load uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++ 0x81 extra: all_loads_pebs Load uops retired to architected path with filter on bits 0 and 1 applied. 
Errata: HSM30 ++ 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++ 0x82 extra: all_stores_pebs Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 ++name:mem_load_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. Errata: HSM30 ++ 0x1 extra: l1_hit_pebs Retired load uops with L1 cache hits as data sources. Errata: HSM30 ++ 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. Errata: HSM30 ++ 0x2 extra: l2_hit_pebs Retired load uops with L2 cache hits as data sources. Errata: HSM30 ++ 0x4 extra: l3_hit Retired load uops which data sources were data hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x4 extra: l3_hit_pebs Retired load uops which data sources were data hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x8 extra: l1_miss Retired load uops misses in L1 cache as data sources. Errata: HSM30 ++ 0x8 extra: l1_miss_pebs Retired load uops misses in L1 cache as data sources. Errata: HSM30 ++ 0x10 extra: l2_miss Miss in mid-level (L2) cache. Excludes Unknown data-source. Errata: HSM30 ++ 0x10 extra: l2_miss_pebs Miss in mid-level (L2) cache. Excludes Unknown data-source. Errata: HSM30 ++ 0x20 extra: l3_miss Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 ++ 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 ++ 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 ++ 0x40 extra: hit_lfb_pebs Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. 
Errata: HSM30 ++name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++ 0x1 extra: xsnp_miss Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x1 extra: xsnp_miss_pebs Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x2 extra: xsnp_hit Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x2 extra: xsnp_hit_pebs Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 ++ 0x4 extra: xsnp_hitm Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30 ++ 0x4 extra: xsnp_hitm_pebs Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30 ++ 0x8 extra: xsnp_none Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 ++ 0x8 extra: xsnp_none_pebs Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 ++name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++ 0x1 extra: local_dram This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30 ++ 0x1 extra: local_dram_pebs This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. 
Errata: HSM30 + name:l2_trans type:exclusive default:0x80 + 0x80 extra: all_requests Transactions accessing L2 pipe + 0x1 extra: demand_data_rd Demand Data Read requests that access L2 cache + 0x2 extra: rfo RFO requests that access L2 cache + 0x4 extra: code_rd L2 cache accesses when fetching instructions +- 0x8 extra: all_pf L2 or LLC HW prefetches that access L2 cache ++ 0x8 extra: all_pf L2 or L3 HW prefetches that access L2 cache + 0x10 extra: l1d_wb L1D writebacks that access L2 cache + 0x20 extra: l2_fill L2 fill requests that access L2 cache + 0x40 extra: l2_wb L2 writebacks that access L2 cache + name:l2_lines_in type:exclusive default:0x7 +- 0x7 extra: all L2 cache lines filling L2 ++ 0x7 extra: all This event counts the number of L2 cache lines brought into the L2 cache. Lines are filled into the L2 cache when there was an L2 miss. + 0x1 extra: i L2 cache lines in I state filling L2 + 0x2 extra: s L2 cache lines in S state filling L2 + 0x4 extra: e L2 cache lines in E state filling L2 diff --git a/SOURCES/oprofile-num_symbolic.patch b/SOURCES/oprofile-num_symbolic.patch new file mode 100644 index 0000000..c081a22 --- /dev/null +++ b/SOURCES/oprofile-num_symbolic.patch @@ -0,0 +1,128 @@ +From 6f10a5b14f5b7f43568d109633533a8ecc057fc6 Mon Sep 17 00:00:00 2001 +From: Lars Friend +Date: Tue, 15 Oct 2013 01:14:53 -0400 +Subject: [PATCH] Allow events with extra flags to also set unit_mask + +Older distributions may be running kernels that still use the +/dev/opcontrol interface. On an Intel Ivy Bridge machine and similar +processors may want to do something like: + +opcontrol --setup --no-vmlinux \ + --event CPU_CLK_UNHALTED:2000000:0:0:1 \ + --event uops_executed:2000000:stall_cycles:0:1 + +For the uops_executed event in the above example need to both set the +extra and the unit_mask bits. The current code in opcontrol would +never set the unit_mask bits when the extra bits were set. This +change allows both to be set when required. 
+ +Signed-off-by: William Cohen +--- + doc/ophelp.1.in | 4 ++++ + utils/opcontrol | 9 +++++++-- + utils/ophelp.c | 27 ++++++++++++++++++++++++++- + 3 files changed, 37 insertions(+), 3 deletions(-) + +diff --git a/doc/ophelp.1.in b/doc/ophelp.1.in +index 083cc85..97383bf 100644 +--- a/doc/ophelp.1.in ++++ b/doc/ophelp.1.in +@@ -49,6 +49,10 @@ Show the default unit mask for the given event. + Show the default unit mask for the given event. + .br + .TP ++.BI "--symbolic-unit-mask / -U [event]" ++Show the numerical unit and extra mask for given event. ++.br ++.TP + .BI "--extra-mask / -E [event]" + Show the extra unit mask for given event. + .br +diff --git a/utils/opcontrol b/utils/opcontrol +index 38bb1ac..a3a6a3c 100644 +--- a/utils/opcontrol ++++ b/utils/opcontrol +@@ -1522,9 +1522,14 @@ do_param_setup() + set_ctr_param $CTR count $COUNT + set_ctr_param $CTR kernel $KERNEL + set_ctr_param $CTR user $USER +- set_ctr_param $CTR unit_mask $UNIT_MASK + +- EXTRA=`$OPHELP --extra-mask $EVENT:$COUNT:$UNIT_MASK_NAMED` ++ # Resolve a [potentially] symbolic unit mask to a numeric ++ # unit mask and extra mask. ++ TMP_SYMBOLIC="`$OPHELP --symbolic-unit-mask $EVENT:$COUNT:$UNIT_MASK`" ++ UNIT_MASK_NUM=`echo $TMP_SYMBOLIC | awk '{print $1}'` ++ EXTRA=`echo $TMP_SYMBOLIC | awk '{print $2}'` ++ set_ctr_param $CTR unit_mask $UNIT_MASK_NUM ++ + if test "$EXTRA" -ne 0 ; then + # A value >= 0x40000 returned by 'ophelp --extra-mask' (EXTRA_MIN_VAL) is interpreted + # as a valid extra value; otherwise we interpret as a simple unit mask value +diff --git a/utils/ophelp.c b/utils/ophelp.c +index 7543c6f..f77a19a 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -282,6 +282,22 @@ static void resolve_events(void) + free(counter_map); + } + ++static void resolve_symbolic_unit_mask(void) ++{ ++ size_t count; ++ unsigned extra = 0; ++ ++ count = parse_events(parsed_events, num_chosen_events, chosen_events, ++ ignore_count ? 
0 : 1); ++ if (count > 1) { ++ fprintf(stderr, "More than one event specified.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ op_resolve_unit_mask(parsed_events, &extra); ++ ++ printf("%d %d\n", parsed_events[0].unit_mask, extra); ++} + + static void show_unit_mask(void) + { +@@ -334,6 +349,7 @@ static int check_events; + static int unit_mask; + static int get_default_event; + static int extra_mask; ++static int symbolic_unit_mask; + + static struct poptOption options[] = { + { "cpu-type", 'c', POPT_ARG_STRING, &cpu_string, 0, +@@ -356,6 +372,9 @@ static struct poptOption options[] = { + "list events as XML", NULL, }, + { "extra-mask", 'E', POPT_ARG_NONE, &extra_mask, 0, + "print extra mask for event", NULL, }, ++ { "symbolic-unit-mask", 'U', POPT_ARG_NONE, &symbolic_unit_mask, 0, ++ "resolve an event with symbolic unit mask into numeric unit " ++ "and extra masks", NULL, }, + POPT_AUTOHELP + { NULL, 0, 0, NULL, 0, NULL, NULL, }, + }; +@@ -457,11 +476,17 @@ int main(int argc, char const * argv[]) + + events = op_events(cpu_type); + +- if (!chosen_events && (unit_mask || check_events || extra_mask)) { ++ if (!chosen_events && (unit_mask || check_events || extra_mask || ++ symbolic_unit_mask)) { + fprintf(stderr, "No events given.\n"); + exit(EXIT_FAILURE); + } + ++ if (symbolic_unit_mask) { ++ resolve_symbolic_unit_mask(); ++ exit(EXIT_SUCCESS); ++ } ++ + if (unit_mask) { + show_unit_mask(); + exit(EXIT_SUCCESS); +-- +1.8.3.1 + diff --git a/SOURCES/oprofile-power8.patch b/SOURCES/oprofile-power8.patch new file mode 100644 index 0000000..91e6fb8 --- /dev/null +++ b/SOURCES/oprofile-power8.patch @@ -0,0 +1,1376 @@ +commit 3795ee4a10c11e16c8d13b5a5d7a6f10615a40d5 +Author: Maynard Johnson +Date: Wed Sep 25 11:15:30 2013 -0500 + + Add two new POWER8 events that are needed for stall analysis + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 994dc27..9c96949 100644 +--- a/events/ppc64/power8/events ++++ 
b/events/ppc64/power8/events +@@ -54,6 +54,7 @@ event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CO + event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty fo this thread due to branch mispred. + event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty fo this thread due to Icache Miss and branch mispred. + event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty fo this thread due to dispatch hold on this thread due to Issue q full. ++event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty fo this thread due to dispatch hold on this thread due to Mapper full. + event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty fo this thread due to dispatch hold on this thread due to sync. + event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty fo this thread due to dispatch hold on this thread due to SRQ full. + event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty fo this thread due to icach l3 miss. +@@ -87,6 +88,7 @@ event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked In + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. + event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count ). + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed + event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. 
+ event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. +commit 717d4595a0d60faffeb8b9611dda850e3f998ef8 +Author: Maynard Johnson +Date: Mon Feb 3 17:50:54 2014 -0600 + + Fix up event codes for marked architected events + + Fourteen events in the set of architected events had the wrong + event encoding. All 14 were "marked" events, used in random + sampling. + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index 1048ec9..465cbbd 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -30,20 +30,20 @@ event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : + event:0x200fc counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss + event:0x400f0 counters:3 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1 + event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded due to a DERAT miss +-event:0x300e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted +-event:0x100e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed +-event:0x400e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 +-event:0x200e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 +-event:0x200e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory +-event:0x300e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes +-event:0x400e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss +-event:0x400e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete +-event:0x100e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has 
dispatched a randomly sampled marked instruction +-event:0x400e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache +-event:0x100e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss +-event:0x100ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload +-event:0x200e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss +-event:0x300e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : marked store completed and sent to nest ++event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted ++event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed ++event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 ++event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory ++event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes ++event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss ++event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete ++event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction ++event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache 
++event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss ++event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload ++event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss ++event:0x301e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : marked store completed and sent to nest + event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles + event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions + event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR +commit 029735879c7ff3ec23aa97dec5ffd95867836cdb +Author: Maynard Johnson +Date: Fri Feb 7 08:58:28 2014 -0600 + + Fix various event names and codes for IBM architected and POWER8 events + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index 465cbbd..f8a9efb 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -8,55 +8,55 @@ + # Manually add CYCLES for backward compatibility for default event + event:0x100f0 counters:0 um:zero minimum:100000 name:CYCLES : Cycles + +-event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : one or more ppc instructions finished +-event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched +-event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : One of threads in run_cycles +-event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts +-event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : New event for Branch Taken +-event:0x100f0 counters:0 um:zero minimum:100000 name:PM_CYC : Cycles +-event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 
Miss (not L2 hit) +-event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit) +-event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : data from Memory +-event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG reload +-event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt +-event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished +-event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type) +-event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : No itags assigned +-event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_MISS : Cycles Instruction ERAT was reloaded +-event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : Number of PPC Dispatched +-event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : A Instruction cacheline request resolved from a location that was beyond the local L3 cache +-event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded (always zero on POWER6) +-event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load +-event:0x200fc counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss +-event:0x400f0 counters:3 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1 +-event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded due to a DERAT miss +-event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted +-event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken completed +-event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : sampled load resolved beyond L2 +-event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : sampled load resolved beyond L3 
+-event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : sampled load resolved from memory +-event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes +-event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : sampled Instruction dtlb miss +-event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : Marked group complete +-event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : The thread has dispatched a randomly sampled marked instruction +-event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : sampled instruction missed icache and came from beyond L3 A Instruction cacheline request for a marked/sampled instruction resolved from a location that was beyond the local L3 cache +-event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : sampled Instruction suffered an icache Miss +-event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Sampled Instruction had a data reload +-event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss +-event:0x301e2 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL : marked store completed and sent to nest +-event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles +-event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions +-event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR +-event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished +-event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1 +-event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event +-event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : PPC Instructions Finished when both threads in run_cycles +-event:0x300ea counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_1024 : Threshold 
counter exceeded a value of 1024 Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 1024 +-event:0x400ea counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_128 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 128 +-event:0x400ec counters:3 um:zero minimum:10000 name:PM_THRESH_EXC_2048 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 2048 +-event:0x100e8 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_256 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 256 +-event:0x200e6 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_32 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 32 +-event:0x100e6 counters:0 um:zero minimum:10000 name:PM_THRESH_EXC_4096 : Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 4096 +-event:0x200e8 counters:1 um:zero minimum:10000 name:PM_THRESH_EXC_512 : Threshold counter exceeded a value of 512 Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 512 +-event:0x300e8 counters:2 um:zero minimum:10000 name:PM_THRESH_EXC_64 : Threshold counter exceeded a value of 64 
Architecture provides a thresholding counter in MMCRA, it has a start and stop events to configure and a programmable threshold, this event increments when the threshold exceeded a count of 64 +-event:0x100ec counters:0 um:zero minimum:10000 name:PM_THRESH_MET : Threshold exceeded ++event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). ++event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched. Could be a group with only microcode. Issue HW016521 ++event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). ++event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts. ++event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : Branch Taken. ++event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles. ++event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit). ++event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit). ++event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : Data cache reload from memory (including L4). ++event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). ++event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt. ++event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished. ++event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type). ++event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : Pipeline empty (No itags assigned , no GCT slots used). ++event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). ++event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : PPC Dispatched. 
++event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : Inst from L3 miss. ++event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded. ++event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . ++event:0x200fd counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss. ++event:0x3e054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1. ++event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). ++event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. ++event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. ++event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. ++event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. ++event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. ++event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : Marked dtlb miss. ++event:0x401e0 counters:3 um:zero minimum:1000 name:PM_MRK_INST_CMPL : marked instruction completed. ++event:0x101e0 counters:0 um:zero minimum:1000 name:PM_MRK_INST_DISP : Marked Instruction dispatched. ++event:0x401e6 counters:3 um:zero minimum:1000 name:PM_MRK_INST_FROM_L3MISS : n/a ++event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : Marked L1 Icache Miss. ++event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. 
++event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. ++event:0x10134 counters:0 um:zero minimum:1000 name:PM_MRK_ST_CMPL : Marked store completed. ++event:0x60005 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. ++event:0x50009 counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. ++event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR. ++event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished (store sent to nest). ++event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. ++event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event. ++event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : Concurrent Run Instructions. ++event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x401ea counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_128 : Threshold counter exceeded a value of 128. ++event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x101e8 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_256 : Threshold counter exceed a count of 256. ++event:0x201e6 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_32 : Threshold counter exceeded a value of 32. ++event:0x101e6 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_4096 : Threshold counter exceed a count of 4096. ++event:0x201e8 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_512 : Threshold counter exceeded a value of 512. ++event:0x301e8 counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_64 : Threshold counter exceeded a value of 64. ++event:0x101ec counters:0 um:zero minimum:10000 name:PM_THRESH_MET : threshold exceeded. 
+diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 9c96949..54430b4 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -10,7 +10,7 @@ include:ppc64/architected_events_v1 + event:0x40036 counters:3 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. + event:0x40060 counters:3 um:zero minimum:10000 name:PM_BR_CMPL : Branch Instruction completed. + event:0x40138 counters:3 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. +-event:0x1e054 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. ++event:0x4000a counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. + event:0x4d018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU : Completion stall due to a Branch Unit. + event:0x2d018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU_CRU : Completion stall due to IFU. + event:0x30026 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_COQ_FULL : Completion stall due to CO q full. +@@ -30,6 +30,7 @@ event:0x4d014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LOAD_FINISH : + event:0x2c010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU : Completion stall by LSU instruction. + event:0x10036 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LWSYNC : completion stall due to isync/lwsync. + event:0x30028 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_MEM_ECC_DELAY : Completion stall due to mem ECC delay. ++event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop + event:0x2e01e counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NTCG_FLUSH : Completion stall due to reject (load hit store). + event:0x30006 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_OTHER_CMPL : Instructions core completed while this thread was stalled. + event:0x4c010 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_REJECT : Completion stall due to LSU reject. 
+@@ -62,7 +63,7 @@ event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct + event:0x3000a counters:2 um:zero minimum:100000 name:PM_GRP_DISP : dispatch_success (Group Dispatched). + event:0x10130 counters:0 um:zero minimum:10000 name:PM_GRP_MRK : Instruction marked in idu. + event:0x2000a counters:1 um:zero minimum:10000 name:PM_HV_CYC : cycles in hypervisor mode . +-event:0x10002 counters:0 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). ++event:0x2 counters:0,1,2,3 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). + event:0x10014 counters:0 um:zero minimum:100000 name:PM_IOPS_CMPL : IOPS Completed. + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. +@@ -86,14 +87,13 @@ event:0x40130 counters:3 um:zero minimum:1000 name:PM_MRK_GRP_CMPL : marked inst + event:0x20130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? + event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. +-event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count ). ++event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. +-event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed. 
+ event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. + event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. + event:0x10010 counters:0 um:zero minimum:10000 name:PM_PMC4_OVERFLOW : Overflow from counter 4. + event:0x30024 counters:2 um:zero minimum:10000 name:PM_PMC6_OVERFLOW : Overflow from counter 6. +-event:0x40002 counters:3 um:zero minimum:10000 name:PM_PPC_CMPL : PPC Instructions Finished (completed). + event:0x2000c counters:1 um:zero minimum:100000 name:PM_THRD_ALL_RUN_CYC : All Threads in Run_cycles (was both threads in run_cycles). + event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshold counter did not meet threshold. +commit 31389d9cf7c0946479065e0baf0efd52cc4ba1f4 +Author: Maynard Johnson +Date: Fri Feb 7 10:27:46 2014 -0600 + + Fix PM_RUN_CYC and PM_RUN_INST_CMPL event codes broken by previous commit + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index f8a9efb..fad6ca5 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -44,8 +44,8 @@ event:0x101e4 counters:0 um:zero minimum:1000 name:PM_MRK_L1_ICACHE_MISS : Marke + event:0x101ea counters:0 um:zero minimum:1000 name:PM_MRK_L1_RELOAD_VALID : Marked demand reload. + event:0x201e2 counters:1 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1 : Marked DL1 Demand Miss counted at exec time. + event:0x10134 counters:0 um:zero minimum:1000 name:PM_MRK_ST_CMPL : Marked store completed. +-event:0x60005 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. +-event:0x50009 counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. ++event:0x600f4 counters:5 um:zero minimum:100000 name:PM_RUN_CYC : Run_cycles. 
++event:0x500fa counters:4 um:zero minimum:100000 name:PM_RUN_INST_CMPL : Run_Instructions. + event:0x400f4 counters:3 um:zero minimum:10000 name:PM_RUN_PURR : Run_PURR. + event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructions Finished (store sent to nest). + event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. +commit f72665b5f28f0d098a985f29672823158c7e85d9 +Author: Maynard Johnson +Date: Wed May 14 13:50:12 2014 -0500 + + Update events for IBM POWER8 processor + + The initial support for the IBM POWER8 processor was added to oprofile in + May 2013. Some events were held back as their descriptions may have exposed + information about the POWER8 architecture that IBM wanted to remain private + until the official announcement. Some other events were held back because they + had not yet been verified. The POWER8 has now been announced and all events + have been verified, so we can now publish all events. + + Signed-off-by: Maynard Johnson + +diff --git a/events/ppc64/architected_events_v1/events b/events/ppc64/architected_events_v1/events +index fad6ca5..a52d9ee 100644 +--- a/events/ppc64/architected_events_v1/events ++++ b/events/ppc64/architected_events_v1/events +@@ -8,32 +8,32 @@ + # Manually add CYCLES for backward compatibility for default event + event:0x100f0 counters:0 um:zero minimum:100000 name:CYCLES : Cycles + +-event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). ++event:0x100f2 counters:0 um:zero minimum:100000 name:PM_1PLUS_PPC_CMPL : 1 or more ppc insts finished (completed). + event:0x400f2 counters:3 um:zero minimum:100000 name:PM_1PLUS_PPC_DISP : Cycles at least one Instr Dispatched. Could be a group with only microcode. Issue HW016521 +-event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). 
++event:0x100fa counters:0 um:zero minimum:100000 name:PM_ANY_THRD_RUN_CYC : Any thread in run_cycles (was one thread in run_cycles). + event:0x400f6 counters:3 um:zero minimum:10000 name:PM_BR_MPRED_CMPL : Number of Branch Mispredicts. + event:0x200fa counters:1 um:zero minimum:10000 name:PM_BR_TAKEN_CMPL : Branch Taken. +-event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles. ++event:0x1e counters:0,1,2,3 um:zero minimum:100000 name:PM_CYC : Cycles . + event:0x200fe counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS : Demand LD - L2 Miss (not L2 hit). + event:0x300fe counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS : Demand LD - L3 Miss (not L2 hit and not L3 hit). + event:0x400fe counters:3 um:zero minimum:10000 name:PM_DATA_FROM_MEM : Data cache reload from memory (including L4). +-event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). ++event:0x300fc counters:2 um:zero minimum:10000 name:PM_DTLB_MISS : Data PTEG Reloaded (DTLB Miss). + event:0x200f8 counters:1 um:zero minimum:10000 name:PM_EXT_INT : external interrupt. + event:0x100f4 counters:0 um:zero minimum:10000 name:PM_FLOP : Floating Point Operations Finished. + event:0x400f8 counters:3 um:zero minimum:10000 name:PM_FLUSH : Flush (any type). + event:0x100f8 counters:0 um:zero minimum:10000 name:PM_GCT_NOSLOT_CYC : Pipeline empty (No itags assigned , no GCT slots used). +-event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). ++event:0x100f6 counters:0 um:zero minimum:10000 name:PM_IERAT_RELOAD : IERAT Reloaded (Miss). + event:0x200f2 counters:1 um:zero minimum:100000 name:PM_INST_DISP : PPC Dispatched. + event:0x300fa counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3MISS : Inst from L3 miss. + event:0x400fc counters:3 um:zero minimum:10000 name:PM_ITLB_MISS : ITLB Reloaded. 
+-event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . ++event:0x300f6 counters:2 um:zero minimum:10000 name:PM_L1_DCACHE_RELOAD_VALID : DL1 reloaded due to Demand Load . + event:0x200fd counters:1 um:zero minimum:10000 name:PM_L1_ICACHE_MISS : Demand iCache Miss. + event:0x3e054 counters:2 um:zero minimum:10000 name:PM_LD_MISS_L1 : Load Missed L1. +-event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). ++event:0x200f6 counters:1 um:zero minimum:10000 name:PM_LSU_DERAT_MISS : DERAT Reloaded (Miss). + event:0x301e4 counters:2 um:zero minimum:1000 name:PM_MRK_BR_MPRED_CMPL : Marked Branch Mispredicted. + event:0x101e2 counters:0 um:zero minimum:1000 name:PM_MRK_BR_TAKEN_CMPL : Marked Branch Taken. + event:0x401e8 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS : Data cache reload L2 miss. +-event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. ++event:0x201e4 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a marked load. + event:0x201e0 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEM : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. + event:0x301e6 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS : Erat Miss (TLB Access) All page sizes. + event:0x401e4 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS : Marked dtlb miss. +@@ -51,9 +51,9 @@ event:0x200f0 counters:1 um:zero minimum:10000 name:PM_ST_FIN : Store Instructio + event:0x300f0 counters:2 um:zero minimum:10000 name:PM_ST_MISS_L1 : Store Missed L1. + event:0x300f8 counters:2 um:zero minimum:10000 name:PM_TB_BIT_TRANS : timebase event. 
+ event:0x300f4 counters:2 um:zero minimum:100000 name:PM_THRD_CONC_RUN_INST : Concurrent Run Instructions. +-event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x301ea counters:2 um:zero minimum:1000 name:PM_THRESH_EXC_1024 : Threshold counter exceeded a value of 1024. + event:0x401ea counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_128 : Threshold counter exceeded a value of 128. +-event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x401ec counters:3 um:zero minimum:1000 name:PM_THRESH_EXC_2048 : Threshold counter exceeded a value of 2048. + event:0x101e8 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_256 : Threshold counter exceed a count of 256. + event:0x201e6 counters:1 um:zero minimum:1000 name:PM_THRESH_EXC_32 : Threshold counter exceeded a value of 32. + event:0x101e6 counters:0 um:zero minimum:1000 name:PM_THRESH_EXC_4096 : Threshold counter exceed a count of 4096. +diff --git a/events/ppc64/power8/events b/events/ppc64/power8/events +index 54430b4..6e4e688 100644 +--- a/events/ppc64/power8/events ++++ b/events/ppc64/power8/events +@@ -7,9 +7,52 @@ + + include:ppc64/architected_events_v1 + +-event:0x40036 counters:3 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. ++event:0x1f05e counters:0 um:zero minimum:100000 name:PM_1LPAR_CYC : Number of cycles in single lpar mode. ++event:0x2006e counters:1 um:zero minimum:10000 name:PM_2LPAR_CYC : Number of cycles in 2 lpar mode. ++event:0x4e05e counters:3 um:zero minimum:100000 name:PM_4LPAR_CYC : Number of cycles in 4 LPAR mode. 
++event:0x610050 counters:0 um:zero minimum:10000 name:PM_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for all data types ( demand load,data,inst prefetch,inst fetch,xlate (I or d) ++event:0x520050 counters:1 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x620052 counters:1 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x610052 counters:0 um:zero minimum:10000 name:PM_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x610054 counters:0 um:zero minimum:10000 name:PM_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x640052 counters:3 um:zero minimum:10000 name:PM_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x630050 counters:2 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x630052 counters:2 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x640050 counters:3 um:zero minimum:10000 name:PM_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate) ++event:0x2505e counters:1 um:zero minimum:10000 name:PM_BACK_BR_CMPL : Branch instruction completed with a target address less than current instruction address. ++event:0x4082 counters:0,1,2,3 um:zero minimum:10000 name:PM_BANK_CONFLICT : Read blocked due to interleave conflict. The ifar logic will detect an interleave conflict and kill the data that was read that cycle. ++event:0x10068 counters:0 um:zero minimum:10000 name:PM_BRU_FIN : Branch Instruction Finished . ++event:0x20036 counters:1 um:zero minimum:10000 name:PM_BR_2PATH : two path branch. 
++event:0x5086 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_BC_8 : Pairable BC+8 branch that has not been converted to a Resolve Finished in the BRU pipeline ++event:0x5084 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_BC_8_CONV : Pairable BC+8 branch that was converted to a Resolve Finished in the BRU pipeline. + event:0x40060 counters:3 um:zero minimum:10000 name:PM_BR_CMPL : Branch Instruction completed. +-event:0x40138 counters:3 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. ++event:0x40ac counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_CCACHE : Conditional Branch Completed that was Mispredicted due to the Count Cache Target Prediction ++event:0x40b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_CR : Conditional Branch Completed that was Mispredicted due to the BHT Direction Prediction (taken/not taken). ++event:0x40ae counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_LSTACK : Conditional Branch Completed that was Mispredicted due to the Link Stack Target Prediction ++event:0x40ba counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_MPRED_TA : Conditional Branch Completed that was Mispredicted due to the Target Address Prediction from the Count Cache or Link Stack. Only XL-form branches that resolved Taken set this event. ++event:0x10138 counters:0 um:zero minimum:10000 name:PM_BR_MRK_2PATH : marked two path branch. ++event:0x409c counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR0 : Conditional Branch Completed on BR0 (1st branch in group) in which the HW predicted the Direction or Target ++event:0x409e counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR1 : Conditional Branch Completed on BR1 (2nd branch in group) in which the HW predicted the Direction or Target. Note: BR1 can only be used in Single Thread Mode. In all of the SMT modes, only one branch can complete, thus BR1 is unused. 
++event:0x489c counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_BR_CMPL : IFU ++event:0x40a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_BR0 : Conditional Branch Completed on BR0 that used the Count Cache for Target Prediction ++event:0x40a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_BR1 : Conditional Branch Completed on BR1 that used the Count Cache for Target Prediction ++event:0x48a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CCACHE_CMPL : IFU ++event:0x40b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_BR0 : Conditional Branch Completed on BR0 that had its direction predicted. I-form branches do not set this event. In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and bra ++event:0x40b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_BR1 : Conditional Branch Completed on BR1 that had its direction predicted. I-form branches do not set this event. In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and bra ++event:0x48b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_CR_CMPL : IFU ++event:0x40a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_BR0 : Conditional Branch Completed on BR0 that used the Link Stack for Target Prediction ++event:0x40aa counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_BR1 : Conditional Branch Completed on BR1 that used the Link Stack for Target Prediction ++event:0x48a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_LSTACK_CMPL : IFU ++event:0x40b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_BR0 : Conditional Branch Completed on BR0 that had its target address predicted. Only XL-form branches set this event. 
++event:0x40b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_BR1 : Conditional Branch Completed on BR1 that had its target address predicted. Only XL-form branches set this event. ++event:0x48b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_PRED_TA_CMPL : IFU ++event:0x40a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_BR0 : Unconditional Branch Completed on BR0. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was coverted to a Resolve. ++event:0x40a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_BR1 : Unconditional Branch Completed on BR1. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was coverted to a Resolve. ++event:0x48a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_BR_UNCOND_CMPL : IFU ++event:0x3094 counters:0,1,2,3 um:zero minimum:10000 name:PM_CASTOUT_ISSUED : Castouts issued ++event:0x3096 counters:0,1,2,3 um:zero minimum:10000 name:PM_CASTOUT_ISSUED_GPR : Castouts issued GPR ++event:0x10050 counters:0 um:zero minimum:10000 name:PM_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for all data types ( demand load,data,inst prefetch,inst fetch,xlate (I or d). ++event:0x2090 counters:0,1,2,3 um:zero minimum:10000 name:PM_CLB_HELD : CLB Hold: Any Reason + event:0x4000a counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL : Completion stall. + event:0x4d018 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU : Completion stall due to a Branch Unit. + event:0x2d018 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_BRU_CRU : Completion stall due to IFU. 
+@@ -30,7 +73,7 @@ event:0x4d014 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_LOAD_FINISH : + event:0x2c010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_LSU : Completion stall by LSU instruction. + event:0x10036 counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_LWSYNC : completion stall due to isync/lwsync. + event:0x30028 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_MEM_ECC_DELAY : Completion stall due to mem ECC delay. +-event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop ++event:0x2e01c counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NO_NTF : Completion stall due to nop. + event:0x2e01e counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_NTCG_FLUSH : Completion stall due to reject (load hit store). + event:0x30006 counters:2 um:zero minimum:10000 name:PM_CMPLU_STALL_OTHER_CMPL : Instructions core completed while this thread was stalled. + event:0x4c010 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_REJECT : Completion stall due to LSU reject. +@@ -41,59 +84,937 @@ event:0x2d010 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_SCALAR_LONG : + event:0x2c014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_STORE : Completion stall by stores. + event:0x4c01c counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_ST_FWD : Completion stall due to store forward. + event:0x1001c counters:0 um:zero minimum:10000 name:PM_CMPLU_STALL_THRD : Completion stall due to thread conflict. +-event:0x2d014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR : Completion stall due to VSU vector instruction. ++event:0x2d014 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR : Completion stall due to VSU vector instruction. + event:0x4d012 counters:3 um:zero minimum:10000 name:PM_CMPLU_STALL_VECTOR_LONG : Completion stall due to VSU vector long instruction. + event:0x2d012 counters:1 um:zero minimum:10000 name:PM_CMPLU_STALL_VSU : Completion stall due to VSU instruction. 
+-event:0x1c042 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a demand load or demand load plus prefetch controlled by MMCR1[20]. +-event:0x1c040 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a demand load or demand load plus prefetch controlled by MMCR1[20] . +-event:0x4c042 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a demand load. +-event:0x4c04e counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to a demand load. +-event:0x1c044 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a demand load or demand load plus prefetch controlled by MMCR1[20]. +-event:0x2c048 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a demand load. +-event:0x2c04c counters:1 um:zero minimum:10000 name:PM_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a demand load. +-event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CONF : A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.. +-event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty fo this thread due to branch mispred. +-event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty fo this thread due to Icache Miss and branch mispred. 
+-event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty fo this thread due to dispatch hold on this thread due to Issue q full. +-event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty fo this thread due to dispatch hold on this thread due to Mapper full. +-event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty fo this thread due to dispatch hold on this thread due to sync. +-event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty fo this thread due to dispatch hold on this thread due to SRQ full. +-event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty fo this thread due to icach l3 miss. +-event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct empty fo this thread due to Icache Miss. ++event:0x16083 counters:0 um:zero minimum:10000 name:PM_CO0_ALLOC : 0.0 ++event:0x16082 counters:0 um:zero minimum:10000 name:PM_CO0_BUSY : CO mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0x517082 counters:0 um:zero minimum:10000 name:PM_CO_DISP_FAIL : CO dispatch failed due to all CO machines being busy ++event:0x527084 counters:1 um:zero minimum:10000 name:PM_CO_TM_SC_FOOTPRINT : L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) ++event:0x3608a counters:2 um:zero minimum:10000 name:PM_CO_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x40066 counters:3 um:zero minimum:10000 name:PM_CRU_FIN : IFU Finished a (non-branch) instruction. 
++event:0x61c050 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load ++event:0x64c048 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c048 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04c counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04c counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c042 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c046 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c046 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's 
L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04e counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c040 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c040 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c040 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c040 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c042 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c044 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c044 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_ECO_SHR : The 
processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c044 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c046 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04e counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c042 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c042 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c044 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04c counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c048 
counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04c counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x64c04a counters:3 um:zero minimum:10000 name:PM_DATA_ALL_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c048 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c046 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x61c04a counters:0 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c04a counters:1 um:zero minimum:10000 name:PM_DATA_ALL_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x63c04a counters:2 um:zero minimum:10000 
name:PM_DATA_ALL_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 ++event:0x62c050 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load ++event:0x62c052 counters:1 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x61c052 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load ++event:0x61c054 counters:0 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_CPRED : Pump prediction correct. 
Counts across all types of pumps for a demand load ++event:0x64c052 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load ++event:0x63c050 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load ++event:0x63c052 counters:2 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x64c050 counters:3 um:zero minimum:10000 name:PM_DATA_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load ++event:0x1c050 counters:0 um:zero minimum:10000 name:PM_DATA_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for a demand load. ++event:0x4c048 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c048 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c04c counters:2 um:zero minimum:10000 name:PM_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04c counters:3 um:zero minimum:10000 name:PM_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c042 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c046 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c046 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04e counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L2 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c040 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c040 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x2c040 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c040 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1 . ++event:0x4c042 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x4c044 counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c044 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c044 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c046 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04e counters:3 um:zero minimum:10000 name:PM_DATA_FROM_L3MISS_MOD : The processor's data cache was reloaded from a localtion other than the local core's L3 due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c042 counters:2 um:zero minimum:10000 name:PM_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c042 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c044 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04c counters:0 um:zero minimum:10000 name:PM_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c048 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c04c counters:1 um:zero minimum:10000 name:PM_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. 
++event:0x4c04a counters:3 um:zero minimum:10000 name:PM_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c048 counters:0 um:zero minimum:10000 name:PM_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c046 counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x1c04a counters:0 um:zero minimum:10000 name:PM_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c04a counters:1 um:zero minimum:10000 name:PM_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x3c04a counters:2 um:zero minimum:10000 name:PM_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either only demand loads or demand loads plus prefetches if MMCR1[16] is 1. ++event:0x2c050 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for a demand load. 
++event:0x2c052 counters:1 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x1c052 counters:0 um:zero minimum:10000 name:PM_DATA_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor a demand load. ++event:0x1c054 counters:0 um:zero minimum:10000 name:PM_DATA_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for a demand load. ++event:0x4c052 counters:3 um:zero minimum:10000 name:PM_DATA_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor a demand load. ++event:0x3c050 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for a demand load. ++event:0x3c052 counters:2 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x4c050 counters:3 um:zero minimum:10000 name:PM_DATA_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for a demand load. ++event:0x3001a counters:2 um:zero minimum:10000 name:PM_DATA_TABLEWALK_CYC : Data Tablewalk Active. ++event:0xe0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_DC_COLLISIONS : DATA Cache collisions42 ++event:0x1e050 counters:0 um:zero minimum:10000 name:PM_DC_PREF_STREAM_ALLOC : Stream marked valid. The stream could have been allocated through the hardware prefetch mechanism or through software. 
This is combined ls0 and ls1. ++event:0x2e050 counters:1 um:zero minimum:10000 name:PM_DC_PREF_STREAM_CONF : A demand load referenced a line in an active prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Combine up + down. ++event:0x4e050 counters:3 um:zero minimum:10000 name:PM_DC_PREF_STREAM_FUZZY_CONF : A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.Fuzzy stream confirm (out of order effects, or pf cant keep up). ++event:0x3e050 counters:2 um:zero minimum:10000 name:PM_DC_PREF_STREAM_STRIDED_CONF : A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.. ++event:0x4c054 counters:3 um:zero minimum:10000 name:PM_DERAT_MISS_16G : Data ERAT Miss (Data TLB Access) page size 16G. ++event:0x3c054 counters:2 um:zero minimum:10000 name:PM_DERAT_MISS_16M : Data ERAT Miss (Data TLB Access) page size 16M. ++event:0x1c056 counters:0 um:zero minimum:10000 name:PM_DERAT_MISS_4K : Data ERAT Miss (Data TLB Access) page size 4K. ++event:0x2c054 counters:1 um:zero minimum:10000 name:PM_DERAT_MISS_64K : Data ERAT Miss (Data TLB Access) page size 64K. 
++event:0xb0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU : Finish DFU (all finish) ++event:0xb0be counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_DCFFIX : Convert from fixed opcode finish (dcffix,dcffixq) ++event:0xb0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_DENBCD : BCD->DPD opcode finish (denbcd, denbcdq) ++event:0xb0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DFU_MC : Finish DFU multicycle ++event:0x2092 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_BAL : Dispatch/CLB Hold: Balance ++event:0x2094 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_RES : Dispatch/CLB Hold: Resource ++event:0x20a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_SB : Dispatch/CLB Hold: Scoreboard ++event:0x2098 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_SYNC : Dispatch/CLB Hold: Sync type instruction ++event:0x2096 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_CLB_HELD_TLBIE : Dispatch Hold: Due to TLBIE ++event:0x10006 counters:0 um:zero minimum:10000 name:PM_DISP_HELD : Dispatch Held. ++event:0x20006 counters:1 um:zero minimum:10000 name:PM_DISP_HELD_IQ_FULL : Dispatch held due to Issue q full. ++event:0x1002a counters:0 um:zero minimum:10000 name:PM_DISP_HELD_MAP_FULL : Dispatch held due to Mapper full. ++event:0x30018 counters:2 um:zero minimum:10000 name:PM_DISP_HELD_SRQ_FULL : Dispatch held due SRQ no room. ++event:0x4003c counters:3 um:zero minimum:10000 name:PM_DISP_HELD_SYNC_HOLD : Dispatch held due to SYNC hold. ++event:0x30a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_DISP_HOLD_GCT_FULL : Dispatch Hold Due to no space in the GCT ++event:0x30008 counters:2 um:zero minimum:10000 name:PM_DISP_WT : Dispatched Starved (not held, nothing to dispatch). 
++event:0x4e048 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. ++event:0x3e048 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. ++event:0x3e04c counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a data side request. ++event:0x4e04c counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a data side request. ++event:0x1e042 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a data side request. ++event:0x4e046 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a data side request. ++event:0x3e046 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a data side request. ++event:0x1e04e counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L2 due to a data side request. ++event:0x3e040 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a data side request. 
++event:0x4e040 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a data side request. ++event:0x2e040 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a data side request. ++event:0x1e040 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a data side request. ++event:0x4e042 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a data side request. ++event:0x4e044 counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a data side request. ++event:0x3e044 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a data side request. ++event:0x2e044 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a data side request. ++event:0x1e046 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a data side request. ++event:0x4e04e counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a localtion other than the local core's L3 due to a data side request. 
++event:0x3e042 counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a data side request. ++event:0x2e042 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a data side request. ++event:0x1e044 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a data side request. ++event:0x1e04c counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a data side request. ++event:0x2e048 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a data side request. ++event:0x2e04c counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a data side request. ++event:0x4e04a counters:3 um:zero minimum:10000 name:PM_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a data side request. ++event:0x1e048 counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a data side request. ++event:0x2e046 counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. 
++event:0x1e04a counters:0 um:zero minimum:10000 name:PM_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. ++event:0x2e04a counters:1 um:zero minimum:10000 name:PM_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group (Remote) due to a data side request. ++event:0x3e04a counters:2 um:zero minimum:10000 name:PM_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Remote) due to a data side request. ++event:0xd094 counters:0,1,2,3 um:zero minimum:10000 name:PM_DSLB_MISS : Data SLB Miss - Total of all segment sizes. Data SLB misses ++event:0x1c058 counters:0 um:zero minimum:10000 name:PM_DTLB_MISS_16G : Data TLB Miss page size 16G. ++event:0x4c056 counters:3 um:zero minimum:10000 name:PM_DTLB_MISS_16M : Data TLB Miss page size 16M. ++event:0x2c056 counters:1 um:zero minimum:10000 name:PM_DTLB_MISS_4K : Data TLB Miss page size 4k. ++event:0x3c056 counters:2 um:zero minimum:10000 name:PM_DTLB_MISS_64K : Data TLB Miss page size 64K. ++event:0x50a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FORCE_MISPRED : XL-form branch was mispredicted due to the predicted target address missing from EAT. The EAT forces a mispredict in this case since there is no predicted target to validate. This is a rare case that may occur when the EAT is full and a branch is ++event:0x4084 counters:0,1,2,3 um:zero minimum:10000 name:PM_EAT_FULL_CYC : Cycles No room in EAT. Set on bank conflict and case where no ibuffers available.
++event:0x2080 counters:0,1,2,3 um:zero minimum:10000 name:PM_EE_OFF_EXT_INT : Ee off and external interrupt ++event:0x20b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FAV_TBEGIN : Dispatch time Favored tbegin ++event:0xa0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_FLOP_SUM_SCALAR : flops summary scalar instructions ++event:0xa0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_FLOP_SUM_VEC : flops summary vector instructions ++event:0x2084 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_BR_MPRED : Flush caused by branch mispredict ++event:0x30012 counters:2 um:zero minimum:10000 name:PM_FLUSH_COMPLETION : Completion Flush. ++event:0x2082 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP : Dispatch flush ++event:0x208c counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_SB : Dispatch Flush: Scoreboard ++event:0x2088 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_SYNC : Dispatch Flush: Sync ++event:0x208a counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_DISP_TLBIE : Dispatch Flush: TLBIE ++event:0x208e counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_LSU : Flush initiated by LSU ++event:0x2086 counters:0,1,2,3 um:zero minimum:10000 name:PM_FLUSH_PARTIAL : Partial flush ++event:0xa0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FCONV : Convert instruction executed ++event:0xa0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FEST : Estimate instruction executed ++event:0xa0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU0_FRSP : Round to single precision instruction executed ++event:0xa0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FCONV : Convert instruction executed ++event:0xa0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FEST : Estimate instruction executed ++event:0xa0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_FPU1_FRSP : Round to single precision instruction executed ++event:0x3000c counters:2 um:zero minimum:10000 name:PM_FREQ_DOWN : Frequency is being slewed down due to 
Power Management. ++event:0x4000c counters:3 um:zero minimum:10000 name:PM_FREQ_UP : Frequency is being slewed up due to Power Management. ++event:0x50b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_1 : One pair of instructions fused with TOC in Group0 ++event:0x50ae counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_2 : Two pairs of instructions fused with TOC in Group0 ++event:0x50ac counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP0_3 : Three pairs of instructions fused with TOC in Group0 ++event:0x50b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_TOC_GRP1_1 : One pair of instructions fused with TOC in Group1 ++event:0x50b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_1 : One pair of instructions fused with VSX in Group0 ++event:0x50b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_2 : Two pairs of instructions fused with VSX in Group0 ++event:0x50b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP0_3 : Three pairs of instructions fused with VSX in Group0 ++event:0x50ba counters:0,1,2,3 um:zero minimum:10000 name:PM_FUSION_VSX_GRP1_1 : One pair of instructions fused with VSX in Group1 ++event:0x3000e counters:2 um:zero minimum:10000 name:PM_FXU0_BUSY_FXU1_IDLE : fxu0 busy and fxu1 idle. ++event:0x10004 counters:0 um:zero minimum:10000 name:PM_FXU0_FIN : FXU0 Finished. ++event:0x4000e counters:3 um:zero minimum:10000 name:PM_FXU1_BUSY_FXU0_IDLE : fxu0 idle and fxu1 busy. ++event:0x40004 counters:3 um:zero minimum:10000 name:PM_FXU1_FIN : FXU1 Finished. ++event:0x2000e counters:1 um:zero minimum:10000 name:PM_FXU_BUSY : fxu0 busy and fxu1 busy. ++event:0x1000e counters:0 um:zero minimum:10000 name:PM_FXU_IDLE : fxu0 idle and fxu1 idle. ++event:0x20008 counters:1 um:zero minimum:10000 name:PM_GCT_EMPTY_CYC : No itags assigned either thread (GCT Empty). ++event:0x30a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_MERGE : Group dispatched on a merged GCT empty.
GCT entries can be merged only within the same thread ++event:0x4d01e counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED : Gct empty for this thread due to branch mispred. ++event:0x4d01a counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_BR_MPRED_ICMISS : Gct empty for this thread due to Icache Miss and branch mispred. ++event:0x2d01e counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_ISSQ : Gct empty for this thread due to dispatch hold on this thread due to Issue q full. ++event:0x4d01c counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_MAP : Gct empty for this thread due to dispatch hold on this thread due to Mapper full. ++event:0x2e010 counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_OTHER : Gct empty for this thread due to dispatch hold on this thread due to sync. ++event:0x2d01c counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_DISP_HELD_SRQ : Gct empty for this thread due to dispatch hold on this thread due to SRQ full. ++event:0x4e010 counters:3 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_L3MISS : Gct empty for this thread due to Icache L3 miss. ++event:0x2d01a counters:1 um:zero minimum:10000 name:PM_GCT_NOSLOT_IC_MISS : Gct empty for this thread due to Icache Miss.
++event:0x20a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_11_14_ENTRIES : GCT Utilization 11-14 entries ++event:0x20a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_15_17_ENTRIES : GCT Utilization 15-17 entries ++event:0x20a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_18_ENTRIES : GCT Utilization 18+ entries ++event:0x209c counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_1_2_ENTRIES : GCT Utilization 1-2 entries ++event:0x209e counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_3_6_ENTRIES : GCT Utilization 3-6 entries ++event:0x20a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_GCT_UTIL_7_10_ENTRIES : GCT Utilization 7-10 entries ++event:0x1000a counters:0 um:zero minimum:10000 name:PM_GRP_BR_MPRED_NONSPEC : Group experienced Non-speculative branch mispredict. ++event:0x30004 counters:2 um:zero minimum:100000 name:PM_GRP_CMPL : group completed. + event:0x3000a counters:2 um:zero minimum:100000 name:PM_GRP_DISP : dispatch_success (Group Dispatched). ++event:0x1000c counters:0 um:zero minimum:10000 name:PM_GRP_IC_MISS_NONSPEC : Group experienced Non-speculative I cache miss. + event:0x10130 counters:0 um:zero minimum:10000 name:PM_GRP_MRK : Instruction marked in idu. ++event:0x509c counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_NON_FULL_GROUP : GROUPs where we did not have 6 non branch instructions in the group(ST mode), in SMT mode 3 non branches ++event:0x20050 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate).
++event:0x20052 counters:1 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x10052 counters:0 um:zero minimum:10000 name:PM_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x50a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_2ND_BRANCH : There were enough instructions in the Ibuffer, but 2nd branch ends group ++event:0x50a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_FPU_AFTER_BR : There were enough instructions in the Ibuffer, but FPU OP IN same group after a branch terminates a group, cant do partial flushes ++event:0x509e counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_NOINST : Do not fill every slot in the group, Not enough instructions in the Ibuffer. This includes cases where the group started with enough instructions, but some got knocked out by a cache miss or branch redirect (which would also empty the Ibuffer). ++event:0x50a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_OTHER : There were enough instructions in the Ibuffer, but the group terminated early for some other reason, most likely due to a First or Last. ++event:0x50a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_GRP_TERM_SLOT_LIMIT : There were enough instructions in the Ibuffer, but 3 src RA/RB/RC , 2 way crack caused a group termination + event:0x2000a counters:1 um:zero minimum:10000 name:PM_HV_CYC : cycles in hypervisor mode . ++event:0x4086 counters:0,1,2,3 um:zero minimum:10000 name:PM_IBUF_FULL_CYC : Cycles No room in ibufffully qualified tranfer (if5 valid). 
++event:0x10018 counters:0 um:zero minimum:10000 name:PM_IC_DEMAND_CYC : Demand ifetch pending. ++event:0x4098 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BHT_REDIRECT : L2 I cache demand request due to BHT redirect, branch redirect ( 2 bubbles 3 cycles) ++event:0x409a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_L2_BR_REDIRECT : L2 I cache demand request due to branch Mispredict ( 15 cycle path) ++event:0x4088 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_DEMAND_REQ : Demand Instruction fetch request ++event:0x508a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_INVALIDATE : Ic line invalidated ++event:0x4092 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_HIT : Prefetch Canceled due to icache hit ++event:0x4094 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_L2 : L2 Squashed request ++event:0x4090 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_CANCEL_PAGE : Prefetch Canceled due to page boundary ++event:0x408a counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_REQ : Instruction prefetch requests ++event:0x408e counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_PREF_WRITE : Instruction prefetch written into IL1 ++event:0x4096 counters:0,1,2,3 um:zero minimum:10000 name:PM_IC_RELOAD_PRIVATE : Reloading line was brought in private for a specific thread. Most lines are brought in shared for all eight threads. If RA does not match then invalidates and then brings it shared to other thread. In P7 line brought in private , then line was inv ++event:0x4006a counters:3 um:zero minimum:10000 name:PM_IERAT_RELOAD_16M : IERAT Reloaded (Miss) for a 16M page. ++event:0x20064 counters:1 um:zero minimum:10000 name:PM_IERAT_RELOAD_4K : IERAT Reloaded (Miss) for a 4k page. ++event:0x3006a counters:2 um:zero minimum:10000 name:PM_IERAT_RELOAD_64K : IERAT Reloaded (Miss) for a 64k page. ++event:0x3405e counters:2 um:zero minimum:10000 name:PM_IFETCH_THROTTLE : Cycles instruction fetch was throttled in IFU.
++event:0x5088 counters:0,1,2,3 um:zero minimum:10000 name:PM_IFU_L2_TOUCH : L2 touch to update MRU on a line ++event:0x514050 counters:0 um:zero minimum:10000 name:PM_INST_ALL_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for an instruction fetch ++event:0x544048 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534048 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x53404c counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_DL4 : The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404c counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_DMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514042 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2 : The processor's Instruction cache was reloaded from local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544046 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L21_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch 
plus prefetch if MMCR1[17] is 1 ++event:0x534046 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L21_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404e counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2MISS : The processor's Instruction cache was reloaded from a localtion other than the local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534040 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_DISP_CONFLICT_LDHITST : The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544040 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_DISP_CONFLICT_OTHER : The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524040 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_MEPF : The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. 
due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514040 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L2_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L2 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544042 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3 : The processor's Instruction cache was reloaded from local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x544044 counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_ECO_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534044 counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_ECO_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524044 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514046 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L31_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404e counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3MISS_MOD : The processor's Instruction cache was reloaded from a localtion other than the local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x534042 
counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_DISP_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524042 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_MEPF : The processor's Instruction cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514044 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_L3_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404c counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_LL4 : The processor's Instruction cache was reloaded from the local chip's L4 cache due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524048 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_LMEM : The processor's Instruction cache was reloaded from the local chip's Memory due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x52404c counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_MEMORY : The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x54404a counters:3 um:zero minimum:10000 name:PM_INST_ALL_FROM_OFF_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x514048 counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_ON_CHIP_CACHE : The processor's 
Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524046 counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x51404a counters:0 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x52404a counters:1 um:zero minimum:10000 name:PM_INST_ALL_FROM_RL4 : The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x53404a counters:2 um:zero minimum:10000 name:PM_INST_ALL_FROM_RMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 ++event:0x524050 counters:1 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for an instruction fetch ++event:0x524052 counters:1 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x514052 counters:0 um:zero minimum:10000 name:PM_INST_ALL_GRP_PUMP_MPRED_RTY : Final 
Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pumpfor an instruction fetch ++event:0x514054 counters:0 um:zero minimum:10000 name:PM_INST_ALL_PUMP_CPRED : Pump prediction correct. Counts across all types of pumpsfor an instruction fetch ++event:0x544052 counters:3 um:zero minimum:10000 name:PM_INST_ALL_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor an instruction fetch ++event:0x534050 counters:2 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for an instruction fetch ++event:0x534052 counters:2 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x544050 counters:3 um:zero minimum:10000 name:PM_INST_ALL_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for an instruction fetch ++event:0x14050 counters:0 um:zero minimum:10000 name:PM_INST_CHIP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was chip pump (prediction=correct) for an instruction fetch. + event:0x2 counters:0,1,2,3 um:zero minimum:100000 name:PM_INST_CMPL : PPC Instructions Finished (completed). ++event:0x44048 counters:3 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x34048 counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x3404c counters:2 um:zero minimum:10000 name:PM_INST_FROM_DL4 : The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404c counters:3 um:zero minimum:10000 name:PM_INST_FROM_DMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4080 counters:0,1,2,3 um:zero minimum:10000 name:PM_INST_FROM_L1 : Instruction fetches from L1 ++event:0x14042 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2 : The processor's Instruction cache was reloaded from local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44046 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L21_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34046 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L21_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404e counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2MISS : The processor's Instruction cache was reloaded from a localtion other than the local core's L2 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34040 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_LDHITST : The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44040 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L2_DISP_CONFLICT_OTHER : The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24040 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L2_MEPF : The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14040 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L2_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L2 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44042 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3 : The processor's Instruction cache was reloaded from local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x44044 counters:3 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x34044 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L31_ECO_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24044 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L31_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14046 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L31_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404e counters:3 um:zero minimum:10000 name:PM_INST_FROM_L3MISS_MOD : The processor's Instruction cache was reloaded from a localtion other than the local core's L3 due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x34042 counters:2 um:zero minimum:10000 name:PM_INST_FROM_L3_DISP_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24042 counters:1 um:zero minimum:10000 name:PM_INST_FROM_L3_MEPF : The processor's Instruction cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14044 counters:0 um:zero minimum:10000 name:PM_INST_FROM_L3_NO_CONFLICT : The processor's Instruction cache was reloaded from local core's L3 without conflict due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404c counters:0 um:zero minimum:10000 name:PM_INST_FROM_LL4 : The processor's Instruction cache was reloaded from the local chip's L4 cache due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24048 counters:1 um:zero minimum:10000 name:PM_INST_FROM_LMEM : The processor's Instruction cache was reloaded from the local chip's Memory due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x2404c counters:1 um:zero minimum:10000 name:PM_INST_FROM_MEMORY : The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x4404a counters:3 um:zero minimum:10000 name:PM_INST_FROM_OFF_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x14048 counters:0 um:zero minimum:10000 name:PM_INST_FROM_ON_CHIP_CACHE : The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24046 counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_MOD : The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . 
++event:0x1404a counters:0 um:zero minimum:10000 name:PM_INST_FROM_RL2L3_SHR : The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x2404a counters:1 um:zero minimum:10000 name:PM_INST_FROM_RL4 : The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x3404a counters:2 um:zero minimum:10000 name:PM_INST_FROM_RMEM : The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to either an instruction fetch or instruction fetch plus prefetch if MMCR1[17] is 1 . ++event:0x24050 counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was group pump for an instruction fetch. ++event:0x24052 counters:1 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope OR Final Pump Scope(Group) got data from source that was at smaller scope(Chip) Final pump was group pump and initial pump was chip or final and initial pump was gro ++event:0x14052 counters:0 um:zero minimum:10000 name:PM_INST_GRP_PUMP_MPRED_RTY : Final Pump Scope(Group) to get data sourced, ended up larger than Initial Pump Scope (Chip) Final pump was group pump and initial pump was chip pump for an instruction fetch. ++event:0x1003a counters:0 um:zero minimum:10000 name:PM_INST_IMC_MATCH_CMPL : IMC Match Count. ++event:0x30016 counters:2 um:zero minimum:10000 name:PM_INST_IMC_MATCH_DISP : IMC Matches dispatched. ++event:0x14054 counters:0 um:zero minimum:10000 name:PM_INST_PUMP_CPRED : Pump prediction correct. Counts across all types of pumps for an instruction fetch.
++event:0x44052 counters:3 um:zero minimum:10000 name:PM_INST_PUMP_MPRED : Pump Mis prediction Counts across all types of pumps for an instruction fetch. ++event:0x34050 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for an instruction fetch. ++event:0x34052 counters:2 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x44050 counters:3 um:zero minimum:10000 name:PM_INST_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for an instruction fetch. + event:0x10014 counters:0 um:zero minimum:100000 name:PM_IOPS_CMPL : IOPS Completed. ++event:0x30014 counters:2 um:zero minimum:100000 name:PM_IOPS_DISP : IOPS dispatched. ++event:0x45048 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request. ++event:0x35048 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request. ++event:0x3504c counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request.
++event:0x4504c counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a instruction side request. ++event:0x15042 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a instruction side request. ++event:0x45046 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a instruction side request. ++event:0x35046 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a instruction side request. ++event:0x1504e counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a instruction side request. ++event:0x35040 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a instruction side request. ++event:0x45040 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a instruction side request. ++event:0x25040 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a instruction side request. ++event:0x15040 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a instruction side request.
++event:0x45042 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a instruction side request. ++event:0x45044 counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request. ++event:0x35044 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a instruction side request. ++event:0x25044 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a instruction side request. ++event:0x15046 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a instruction side request. ++event:0x4504e counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a instruction side request. ++event:0x35042 counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a instruction side request. ++event:0x25042 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a instruction side request. ++event:0x15044 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a instruction side request.
++event:0x1504c counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a instruction side request. ++event:0x25048 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request. ++event:0x2504c counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a instruction side request. ++event:0x4504a counters:3 um:zero minimum:10000 name:PM_IPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a instruction side request. ++event:0x15048 counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a instruction side request. ++event:0x25046 counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. ++event:0x1504a counters:0 um:zero minimum:10000 name:PM_IPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a instruction side request. ++event:0x2504a counters:1 um:zero minimum:10000 name:PM_IPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a instruction side request. 
++event:0x3504a counters:2 um:zero minimum:10000 name:PM_IPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a instruction side request. ++event:0x617082 counters:0 um:zero minimum:10000 name:PM_ISIDE_DISP : All i-side dispatch attempts ++event:0x627084 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL : All i-side dispatch attempts that failed due to a addr collision with another machine ++event:0x627086 counters:1 um:zero minimum:10000 name:PM_ISIDE_DISP_FAIL_OTHER : All i-side dispatch attempts that failed due to a reason other than addrs collision ++event:0x4608e counters:3 um:zero minimum:10000 name:PM_ISIDE_L2MEMACC : valid when first beat of data comes in for an i-side fetch where data came from mem(or L4) ++event:0x44608e counters:3 um:zero minimum:10000 name:PM_ISIDE_MRU_TOUCH : Iside L2 MRU touch ++event:0xd096 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISLB_MISS : I SLB Miss. ++event:0x30ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX0 : FX0 ISU reject ++event:0x30ae counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FX1 : FX1 ISU reject ++event:0x38ac counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_FXU : ISU ++event:0x30b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS0 : LS0 ISU reject ++event:0x30b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS1 : LS1 ISU reject ++event:0x30b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS2 : LS2 ISU reject ++event:0x30b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REF_LS3 : LS3 ISU reject ++event:0x309c counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECTS_ALL : All isu rejects could be more than 1 per cycle ++event:0x30a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_RES_NA : ISU reject due to resource not available ++event:0x309e counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SAR_BYPASS : Reject because of SAR bypass ++event:0x30a0 
counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJECT_SRC_NA : ISU reject due to source not available ++event:0x30a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS0 : VS0 ISU reject ++event:0x30aa counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VS1 : VS1 ISU reject ++event:0x38a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISU_REJ_VSU : ISU ++event:0x30b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_ISYNC : Isync count per thread ++event:0x200301ea counters:2 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_2048 : Reload latency exceeded 2048 cyc ++event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_256 : Reload latency exceeded 256 cyc ++event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_L1MISS_LAT_EXC_32 : Reload latency exceeded 32 cyc ++event:0x26086 counters:1 um:zero minimum:10000 name:PM_L1PF_L2MEMACC : valid when first beat of data comes in for an L1pref where data came from mem(or L4) ++event:0x1002c counters:0 um:zero minimum:10000 name:PM_L1_DCACHE_RELOADED_ALL : L1 data cache reloaded for demand or prefetch . ++event:0x408c counters:0,1,2,3 um:zero minimum:10000 name:PM_L1_DEMAND_WRITE : Instruction Demand sectors written into IL1 ++event:0x40012 counters:3 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_ALL : Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch. ++event:0x30068 counters:2 um:zero minimum:10000 name:PM_L1_ICACHE_RELOADED_PREF : Counts all Icache prefetch reloads ( includes demand turned into prefetch).
++event:0x417080 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_MOD : L2 Castouts - Modified (M, Mu, Me) ++event:0x417082 counters:0 um:zero minimum:10000 name:PM_L2_CASTOUT_SHR : L2 Castouts - Shared (T, Te, Si, S) ++event:0x27084 counters:1 um:zero minimum:10000 name:PM_L2_CHIP_PUMP : RC requests that were local on chip pump attempts ++event:0x427086 counters:1 um:zero minimum:10000 name:PM_L2_DC_INV : Dcache invalidates from L2 ++event:0x44608c counters:3 um:zero minimum:10000 name:PM_L2_DISP_ALL_L2MISS : All successful Ld/St dispatches for this thread that were an L2miss. ++event:0x64608e counters:3 um:zero minimum:10000 name:PM_L2_GROUP_PUMP : RC requests that were on Node Pump attempts ++event:0x626084 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_CORRECT : L2 guess grp and guess was correct (data intra-6chip AND ^on-chip) ++event:0x626086 counters:1 um:zero minimum:10000 name:PM_L2_GRP_GUESS_WRONG : L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip) ++event:0x427084 counters:1 um:zero minimum:10000 name:PM_L2_IC_INV : Icache Invalidates from L2 ++event:0x436088 counters:2 um:zero minimum:10000 name:PM_L2_INST : All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs) ++event:0x43608a counters:2 um:zero minimum:10000 name:PM_L2_INST_MISS : All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs) ++event:0x416080 counters:0 um:zero minimum:10000 name:PM_L2_LD : All successful D-side Load dispatches for this thread ++event:0x437088 counters:2 um:zero minimum:10000 name:PM_L2_LD_DISP : All successful load dispatches ++event:0x43708a counters:2 um:zero minimum:10000 name:PM_L2_LD_HIT : All successful load dispatches that were L2 hits ++event:0x426084 counters:1 um:zero minimum:10000 name:PM_L2_LD_MISS : All successful D-Side Load dispatches that were an L2miss for this thread ++event:0x616080 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_CORRECT : L2 guess loc 
and guess was correct (ie data local) ++event:0x616082 counters:0 um:zero minimum:10000 name:PM_L2_LOC_GUESS_WRONG : L2 guess loc and guess was not correct (ie data not on chip) ++event:0x516080 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP : L2 RC load dispatch attempt ++event:0x516082 counters:0 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_ADDR : L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ ++event:0x526084 counters:1 um:zero minimum:10000 name:PM_L2_RCLD_DISP_FAIL_OTHER : L2 RC load dispatch attempt failed due to other reasons ++event:0x536088 counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP : L2 RC store dispatch attempt ++event:0x53608a counters:2 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_ADDR : L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ ++event:0x54608c counters:3 um:zero minimum:10000 name:PM_L2_RCST_DISP_FAIL_OTHER : L2 RC store dispatch attempt failed due to other reasons ++event:0x537088 counters:2 um:zero minimum:10000 name:PM_L2_RC_ST_DONE : RC did st to line that was Tx or Sx ++event:0x63708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_LD : RC retries on PB for any load from core ++event:0x3708a counters:2 um:zero minimum:10000 name:PM_L2_RTY_ST : RC retries on PB for any store from core ++event:0x54708c counters:3 um:zero minimum:10000 name:PM_L2_SN_M_RD_DONE : SNP dispatched for a read and was M ++event:0x54708e counters:3 um:zero minimum:10000 name:PM_L2_SN_M_WR_DONE : SNP dispatched for a write and was M ++event:0x53708a counters:2 um:zero minimum:10000 name:PM_L2_SN_SX_I_DONE : SNP dispatched and went from Sx or Tx to Ix ++event:0x17080 counters:0 um:zero minimum:10000 name:PM_L2_ST : All successful D-side store dispatches for this thread ++event:0x44708c counters:3 um:zero minimum:10000 name:PM_L2_ST_DISP : All successful store dispatches ++event:0x44708e counters:3 um:zero minimum:10000 name:PM_L2_ST_HIT : All successful store dispatches that were L2Hits 
++event:0x17082 counters:0 um:zero minimum:10000 name:PM_L2_ST_MISS : All successful D-side store dispatches for this thread that were L2 Miss ++event:0x636088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_CORRECT : L2 guess sys and guess was correct (ie data beyond-6chip) ++event:0x63608a counters:2 um:zero minimum:10000 name:PM_L2_SYS_GUESS_WRONG : L2 guess sys and guess was not correct (ie data ^beyond-6chip) ++event:0x37088 counters:2 um:zero minimum:10000 name:PM_L2_SYS_PUMP : RC requests that were system pump attempts ++event:0x1e05e counters:0 um:zero minimum:10000 name:PM_L2_TM_REQ_ABORT : TM abort. ++event:0x3e05c counters:2 um:zero minimum:10000 name:PM_L2_TM_ST_ABORT_SISTER : TM marked store abort. ++event:0x23808a counters:2 um:zero minimum:10000 name:PM_L3_CINJ : l3 ci of cache inject ++event:0x128084 counters:1 um:zero minimum:10000 name:PM_L3_CI_HIT : L3 Castins Hit (total count ++event:0x128086 counters:1 um:zero minimum:10000 name:PM_L3_CI_MISS : L3 castins miss (total count ++event:0x819082 counters:0 um:zero minimum:10000 name:PM_L3_CI_USAGE : rotating sample of 16 CI or CO actives ++event:0x438088 counters:2 um:zero minimum:10000 name:PM_L3_CO : l3 castout occurring ( does not include casthrough or log writes (cinj/dmaw) ++event:0x83908b counters:2 um:zero minimum:10000 name:PM_L3_CO0_ALLOC : 0.0 ++event:0x83908a counters:2 um:zero minimum:10000 name:PM_L3_CO0_BUSY : lifetime, sample of CO machine 0 valid ++event:0x28086 counters:1 um:zero minimum:10000 name:PM_L3_CO_L31 : L3 CO to L3.1 OR of port 0 and 1 ( lossy) ++event:0x238088 counters:2 um:zero minimum:10000 name:PM_L3_CO_LCO : Total L3 castouts occurred on LCO ++event:0x28084 counters:1 um:zero minimum:10000 name:PM_L3_CO_MEM : L3 CO to memory OR of port 0 and 1 ( lossy) ++event:0x18082 counters:0 um:zero minimum:10000 name:PM_L3_CO_MEPF : L3 CO of line in Mep state ( includes casthrough ++event:0xb19082 counters:0 um:zero minimum:10000 name:PM_L3_GRP_GUESS_CORRECT : Initial
scope=group and data from same group (near) (pred successful) ++event:0xb3908a counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_HIGH : Initial scope=group but data from local node. Prediction too high ++event:0xb39088 counters:2 um:zero minimum:10000 name:PM_L3_GRP_GUESS_WRONG_LOW : Initial scope=group but data from outside group (far or rem). Prediction too Low ++event:0x218080 counters:0 um:zero minimum:10000 name:PM_L3_HIT : L3 Hits ++event:0x138088 counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_HIT : L2 castout hits ++event:0x13808a counters:2 um:zero minimum:10000 name:PM_L3_L2_CO_MISS : L2 castout miss ++event:0x14808c counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_HIT : L3 Lateral Castins Hit ++event:0x14808e counters:3 um:zero minimum:10000 name:PM_L3_LAT_CI_MISS : L3 Lateral Castins Miss ++event:0x228084 counters:1 um:zero minimum:10000 name:PM_L3_LD_HIT : L3 demand LD Hits ++event:0x228086 counters:1 um:zero minimum:10000 name:PM_L3_LD_MISS : L3 demand LD Miss ++event:0x1e052 counters:0 um:zero minimum:10000 name:PM_L3_LD_PREF : L3 Load Prefetches. ++event:0xb19080 counters:0 um:zero minimum:10000 name:PM_L3_LOC_GUESS_CORRECT : initial scope=node/chip and data from local node (local) (pred successful) ++event:0xb29086 counters:1 um:zero minimum:10000 name:PM_L3_LOC_GUESS_WRONG : Initial scope=node but data from out side local node (near or far or rem).
Prediction too Low ++event:0x218082 counters:0 um:zero minimum:10000 name:PM_L3_MISS : L3 Misses ++event:0x54808c counters:3 um:zero minimum:10000 name:PM_L3_P0_CO_L31 : l3 CO to L3.1 (lco) port 0 ++event:0x538088 counters:2 um:zero minimum:10000 name:PM_L3_P0_CO_MEM : l3 CO to memory port 0 ++event:0x929084 counters:1 um:zero minimum:10000 name:PM_L3_P0_CO_RTY : L3 CO received retry port 0 ++event:0xa29084 counters:1 um:zero minimum:10000 name:PM_L3_P0_GRP_PUMP : L3 pf sent with grp scope port 0 ++event:0x528084 counters:1 um:zero minimum:10000 name:PM_L3_P0_LCO_DATA : lco sent with data port 0 ++event:0x518080 counters:0 um:zero minimum:10000 name:PM_L3_P0_LCO_NO_DATA : dataless l3 lco sent port 0 ++event:0xa4908c counters:3 um:zero minimum:10000 name:PM_L3_P0_LCO_RTY : L3 LCO received retry port 0 ++event:0xa19080 counters:0 um:zero minimum:10000 name:PM_L3_P0_NODE_PUMP : L3 pf sent with nodal scope port 0 ++event:0x919080 counters:0 um:zero minimum:10000 name:PM_L3_P0_PF_RTY : L3 PF received retry port 0 ++event:0x939088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SN_HIT : L3 snoop hit port 0 ++event:0x118080 counters:0 um:zero minimum:10000 name:PM_L3_P0_SN_INV : Port0 snooper detects someone doing a store to a line thats Sx ++event:0x94908c counters:3 um:zero minimum:10000 name:PM_L3_P0_SN_MISS : L3 snoop miss port 0 ++event:0xa39088 counters:2 um:zero minimum:10000 name:PM_L3_P0_SYS_PUMP : L3 pf sent with sys scope port 0 ++event:0x54808e counters:3 um:zero minimum:10000 name:PM_L3_P1_CO_L31 : l3 CO to L3.1 (lco) port 1 ++event:0x53808a counters:2 um:zero minimum:10000 name:PM_L3_P1_CO_MEM : l3 CO to memory port 1 ++event:0x929086 counters:1 um:zero minimum:10000 name:PM_L3_P1_CO_RTY : L3 CO received retry port 1 ++event:0xa29086 counters:1 um:zero minimum:10000 name:PM_L3_P1_GRP_PUMP : L3 pf sent with grp scope port 1 ++event:0x528086 counters:1 um:zero minimum:10000 name:PM_L3_P1_LCO_DATA : lco sent with data port 1 ++event:0x518082 counters:0 um:zero 
minimum:10000 name:PM_L3_P1_LCO_NO_DATA : dataless l3 lco sent port 1 ++event:0xa4908e counters:3 um:zero minimum:10000 name:PM_L3_P1_LCO_RTY : L3 LCO received retry port 1 ++event:0xa19082 counters:0 um:zero minimum:10000 name:PM_L3_P1_NODE_PUMP : L3 pf sent with nodal scope port 1 ++event:0x919082 counters:0 um:zero minimum:10000 name:PM_L3_P1_PF_RTY : L3 PF received retry port 1 ++event:0x93908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SN_HIT : L3 snoop hit port 1 ++event:0x118082 counters:0 um:zero minimum:10000 name:PM_L3_P1_SN_INV : Port1 snooper detects someone doing a store to a line thats Sx ++event:0x94908e counters:3 um:zero minimum:10000 name:PM_L3_P1_SN_MISS : L3 snoop miss port 1 ++event:0xa3908a counters:2 um:zero minimum:10000 name:PM_L3_P1_SYS_PUMP : L3 pf sent with sys scope port 1 ++event:0x84908d counters:3 um:zero minimum:10000 name:PM_L3_PF0_ALLOC : 0.0 ++event:0x84908c counters:3 um:zero minimum:10000 name:PM_L3_PF0_BUSY : lifetime, sample of PF machine 0 valid ++event:0x428084 counters:1 um:zero minimum:10000 name:PM_L3_PF_HIT_L3 : l3 pf hit in l3 ++event:0x18080 counters:0 um:zero minimum:10000 name:PM_L3_PF_MISS_L3 : L3 Prefetch missed in L3 ++event:0x3808a counters:2 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_CACHE : L3 Prefetch from Off chip cache ++event:0x4808e counters:3 um:zero minimum:10000 name:PM_L3_PF_OFF_CHIP_MEM : L3 Prefetch from Off chip memory ++event:0x38088 counters:2 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_CACHE : L3 Prefetch from On chip cache ++event:0x4808c counters:3 um:zero minimum:10000 name:PM_L3_PF_ON_CHIP_MEM : L3 Prefetch from On chip memory ++event:0x829084 counters:1 um:zero minimum:10000 name:PM_L3_PF_USAGE : rotating sample of 32 PF actives ++event:0x4e052 counters:3 um:zero minimum:10000 name:PM_L3_PREF_ALL : Total HW L3 prefetches(Load+store). 
++event:0x84908f counters:3 um:zero minimum:10000 name:PM_L3_RD0_ALLOC : 0.0 ++event:0x84908e counters:3 um:zero minimum:10000 name:PM_L3_RD0_BUSY : lifetime, sample of RD machine 0 valid ++event:0x829086 counters:1 um:zero minimum:10000 name:PM_L3_RD_USAGE : rotating sample of 16 RD actives ++event:0x839089 counters:2 um:zero minimum:10000 name:PM_L3_SN0_ALLOC : 0.0 ++event:0x839088 counters:2 um:zero minimum:10000 name:PM_L3_SN0_BUSY : lifetime, sample of snooper machine 0 valid ++event:0x819080 counters:0 um:zero minimum:10000 name:PM_L3_SN_USAGE : rotating sample of 8 snoop valids ++event:0x2e052 counters:1 um:zero minimum:10000 name:PM_L3_ST_PREF : L3 store Prefetches. ++event:0x3e052 counters:2 um:zero minimum:10000 name:PM_L3_SW_PREF : Data stream touch to L3. ++event:0xb29084 counters:1 um:zero minimum:10000 name:PM_L3_SYS_GUESS_CORRECT : Initial scope=system and data from outside group (far or rem)(pred successful) ++event:0xb4908c counters:3 um:zero minimum:10000 name:PM_L3_SYS_GUESS_WRONG : Initial scope=system but data from local or near. Prediction too high ++event:0x24808e counters:3 um:zero minimum:10000 name:PM_L3_TRANS_PF : L3 Transient prefetch ++event:0x18081 counters:0 um:zero minimum:10000 name:PM_L3_WI0_ALLOC : 0.0 ++event:0x418080 counters:0 um:zero minimum:10000 name:PM_L3_WI0_BUSY : lifetime, sample of Write Inject machine 0 valid ++event:0x418082 counters:0 um:zero minimum:10000 name:PM_L3_WI_USAGE : rotating sample of 8 WI actives ++event:0x3c058 counters:2 um:zero minimum:10000 name:PM_LARX_FIN : Larx finished . + event:0x1002e counters:0 um:zero minimum:10000 name:PM_LD_CMPL : count of Loads completed. + event:0x10062 counters:0 um:zero minimum:10000 name:PM_LD_L3MISS_PEND_CYC : Cycles L3 miss was pending for this thread. ++event:0x100ee counters:0 um:zero minimum:10000 name:PM_LD_REF_L1 : Load Ref count combined for all units.
++event:0xc080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU0 : LS0 L1 D cache load references counted at finish, gated by rejectLSU0 L1 D cache load references ++event:0xc082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU1 : LS1 L1 D cache load references counted at finish, gated by rejectLSU1 L1 D cache load references ++event:0xc094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU2 : LS2 L1 D cache load references counted at finish, gated by reject42 ++event:0xc096 counters:0,1,2,3 um:zero minimum:10000 name:PM_LD_REF_L1_LSU3 : LS3 L1 D cache load references counted at finish, gated by reject42 ++event:0x509a counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_INVALID_PTR : A flush were LS ptr is invalid, results in a pop , A lot of interrupts between push and pops ++event:0x5098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LINK_STACK_WRONG_ADD_PRED : Link stack predicts wrong address, because of link stack design limitation. ++event:0xe080 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_ERAT_MISS_PREF : LS0 Erat miss due to prefetch42 ++event:0xd0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_L1_PREF : LS0 L1 cache data prefetches42 ++event:0xc098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS0_L1_SW_PREF : Software L1 Prefetches, including SW Transient Prefetches42 ++event:0xe082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_ERAT_MISS_PREF : LS1 Erat miss due to prefetch42 ++event:0xd0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_L1_PREF : LS1 L1 cache data prefetches42 ++event:0xc09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LS1_L1_SW_PREF : Software L1 Prefetches, including SW Transient Prefetches42 ++event:0xc0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_LRQ : LS0 Flush: LRQLSU0 LRQ flushes ++event:0xc0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_SRQ : LS0 Flush: SRQLSU0 SRQ lhs flushes ++event:0xc0a4 counters:0,1,2,3 um:zero minimum:10000 
name:PM_LSU0_FLUSH_ULD : LS0 Flush: Unaligned LoadLSU0 unaligned load flushes ++event:0xc0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_FLUSH_UST : LS0 Flush: Unaligned StoreLSU0 unaligned store flushes ++event:0xf088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_L1_CAM_CANCEL : ls0 l1 tm cam cancel42 ++event:0x1e056 counters:0 um:zero minimum:10000 name:PM_LSU0_LARX_FIN : . ++event:0xd08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_LMQ_LHR_MERGE : LS0 Load Merged with another cacheline request42 ++event:0xc08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_NCLD : LS0 Non-cachable Loads counted at finishLSU0 non-cacheable loads ++event:0xe090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x1e05a counters:0 um:zero minimum:10000 name:PM_LSU0_REJECT : LSU0 reject . ++event:0xc09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_SRQ_STFWD : LS0 SRQ forwarded data to a loadLSU0 SRQ store forwarded ++event:0xf084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_STORE_REJECT : ls0 store reject42 ++event:0xe0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe098 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU0_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_LRQ : LS1 Flush: LRQLSU1 LRQ flushes ++event:0xc0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_SRQ : LS1 Flush: SRQLSU1 SRQ lhs flushes ++event:0xc0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_ULD : LS 1 Flush: Unaligned LoadLSU1 unaligned load flushes ++event:0xc0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_FLUSH_UST : LS1 Flush: Unaligned StoreLSU1 unaligned store flushes ++event:0xf08a counters:0,1,2,3 um:zero 
minimum:10000 name:PM_LSU1_L1_CAM_CANCEL : ls1 l1 tm cam cancel42 ++event:0x2e056 counters:1 um:zero minimum:10000 name:PM_LSU1_LARX_FIN : Larx finished in LSU pipe1. ++event:0xd08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_LMQ_LHR_MERGE : LS1 Load Merge with another cacheline request42 ++event:0xc08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_NCLD : LS1 Non-cachable Loads counted at finishLSU1 non-cacheable loads ++event:0xe092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x2e05a counters:1 um:zero minimum:10000 name:PM_LSU1_REJECT : LSU1 reject . ++event:0xc09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_SRQ_STFWD : LS1 SRQ forwarded data to a loadLSU1 SRQ store forwarded ++event:0xf086 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_STORE_REJECT : ls1 store reject42 ++event:0xe0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU1_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_LRQ : LS02Flush: LRQ42 ++event:0xc0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_SRQ : LS2 Flush: SRQ42 ++event:0xc0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_FLUSH_ULD : LS3 Flush: Unaligned Load42 ++event:0xf08c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_L1_CAM_CANCEL : ls2 l1 tm cam cancel42 ++event:0x3e056 counters:2 um:zero minimum:10000 name:PM_LSU2_LARX_FIN : Larx finished in LSU pipe2. 
++event:0xc084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LDF : LS2 Scalar Loads42 ++event:0xc088 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LDX : LS0 Vector Loads42 ++event:0xd090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_LMQ_LHR_MERGE : LS0 Load Merged with another cacheline request42 ++event:0xe094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x3e05a counters:2 um:zero minimum:10000 name:PM_LSU2_REJECT : LSU2 reject . ++event:0xc0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_SRQ_STFWD : LS2 SRQ forwarded data to a load42 ++event:0xe0ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU2_TM_L1_MISS : Load tm L1 miss42 ++event:0xc0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_LRQ : LS3 Flush: LRQ42 ++event:0xc0be counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_SRQ : LS13 Flush: SRQ42 ++event:0xc0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_FLUSH_ULD : LS 14Flush: Unaligned Load42 ++event:0xf08e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_L1_CAM_CANCEL : ls3 l1 tm cam cancel42 ++event:0x4e056 counters:3 um:zero minimum:10000 name:PM_LSU3_LARX_FIN : Larx finished in LSU pipe3. 
++event:0xc086 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LDF : LS3 Scalar Loads 42 ++event:0xc08a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LDX : LS1 Vector Loads42 ++event:0xd092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_LMQ_LHR_MERGE : LS1 Load Merge with another cacheline request42 ++event:0xe096 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_PRIMARY_ERAT_HIT : Primary ERAT hit42 ++event:0x4e05a counters:3 um:zero minimum:10000 name:PM_LSU3_REJECT : LSU3 reject . ++event:0xc0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_SRQ_STFWD : LS3 SRQ forwarded data to a load42 ++event:0xe0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TMA_REQ_L2 : addrs only req to L2 only on the first one,Indication that Load footprint is not expanding42 ++event:0xe09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_HIT : Load tm hit in L142 ++event:0xe0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU3_TM_L1_MISS : Load tm L1 miss42 ++event:0xe880 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_ERAT_MISS_PREF : LSU ++event:0x30066 counters:2 um:zero minimum:10000 name:PM_LSU_FIN : LSU Finished an instruction (up to 2 per cycle). ++event:0xc8ac counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FLUSH_UST : LSU ++event:0xd0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_FOUR_TABLEWALK_CYC : Cycles when four tablewalks pending on this thread42 ++event:0x10066 counters:0 um:zero minimum:10000 name:PM_LSU_FX_FIN : LSU Finished a FX operation (up to 2 per cycle. 
++event:0xd8b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_L1_PREF : LSU ++event:0xc898 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_L1_SW_PREF : LSU ++event:0xc884 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LDF : LSU ++event:0xc888 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LDX : LSU ++event:0xd0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_FULL_CYC : LMQ fullCycles LMQ full, ++event:0xd0a1 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_S0_ALLOC : 0.0 ++event:0xd0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LMQ_S0_VALID : Slot 0 of LMQ validLMQ slot 0 valid ++event:0x3001c counters:2 um:zero minimum:10000 name:PM_LSU_LMQ_SRQ_EMPTY_ALL_CYC : ALL threads lsu empty (lmq and srq empty). Issue HW016541 ++event:0x2003e counters:1 um:zero minimum:10000 name:PM_LSU_LMQ_SRQ_EMPTY_CYC : LSU empty (lmq and srq empty). ++event:0xd09f counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S0_ALLOC : 0.0 ++event:0xd09e counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S0_VALID : Slot 0 of LRQ validLRQ slot 0 valid ++event:0xf091 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S43_ALLOC : 0.0 ++event:0xf090 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_LRQ_S43_VALID : LRQ slot 43 was busy42 ++event:0x30162 counters:2 um:zero minimum:10000 name:PM_LSU_MRK_DERAT_MISS : DERAT Reloaded (Miss). ++event:0xc88c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_NCLD : LSU ++event:0xc092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_NCST : Non-cachable Stores sent to nest42 ++event:0x10064 counters:0 um:zero minimum:10000 name:PM_LSU_REJECT : LSU Reject (up to 4 per cycle). ++event:0x2e05c counters:1 um:zero minimum:10000 name:PM_LSU_REJECT_ERAT_MISS : LSU Reject due to ERAT (up to 4 per cycles). ++event:0x4e05c counters:3 um:zero minimum:10000 name:PM_LSU_REJECT_LHS : LSU Reject due to LHS (up to 4 per cycle). 
++event:0x1e05c counters:0 um:zero minimum:10000 name:PM_LSU_REJECT_LMQ_FULL : LSU reject due to LMQ full ( 4 per cycle). ++event:0xd082 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SET_MPRED : Line already in cache at reload time42 ++event:0x40008 counters:3 um:zero minimum:10000 name:PM_LSU_SRQ_EMPTY_CYC : All threads srq empty. ++event:0x1001a counters:0 um:zero minimum:10000 name:PM_LSU_SRQ_FULL_CYC : SRQ is Full. ++event:0xd09d counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S0_ALLOC : 0.0 ++event:0xd09c counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S0_VALID : Slot 0 of SRQ validSRQ slot 0 valid ++event:0xf093 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S39_ALLOC : 0.0 ++event:0xf092 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_S39_VALID : SRQ slot 39 was busy42 ++event:0xd09b counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_SYNC : 0.0 ++event:0xd09a counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_SRQ_SYNC_CYC : A sync is in the SRQ (edge detect to count)SRQ sync duration ++event:0xf084 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_STORE_REJECT : LSU ++event:0xd0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_LSU_TWO_TABLEWALK_CYC : Cycles when two tablewalks pending on this thread42 ++event:0x5094 counters:0,1,2,3 um:zero minimum:10000 name:PM_LWSYNC : threaded version, IC Misses where we got EA dir hit but no sector valids were on. ICBI took line out ++event:0x209a counters:0,1,2,3 um:zero minimum:10000 name:PM_LWSYNC_HELD : LWSYNC held at dispatch ++event:0x4c058 counters:3 um:zero minimum:10000 name:PM_MEM_CO : Memory castouts from this lpar. ++event:0x10058 counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_IFU : Local Memory above threshold for IFU speculation control. ++event:0x40056 counters:3 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_HIGH : Local memory above threshold for LSU medium. 
++event:0x1c05e counters:0 um:zero minimum:10000 name:PM_MEM_LOC_THRESH_LSU_MED : Local memory above threshold for data prefetch. ++event:0x2c058 counters:1 um:zero minimum:10000 name:PM_MEM_PREF : Memory prefetch for this lpar. ++event:0x10056 counters:0 um:zero minimum:10000 name:PM_MEM_READ : Reads from Memory from this lpar (includes data/inst/xlate/l1prefetch/inst prefetch). ++event:0x3c05e counters:2 um:zero minimum:10000 name:PM_MEM_RWITM : Memory rwitm for this lpar. ++event:0x3515e counters:2 um:zero minimum:1000 name:PM_MRK_BACK_BR_CMPL : Marked branch instruction completed with a target address less than current instruction address. ++event:0x2013a counters:1 um:zero minimum:1000 name:PM_MRK_BRU_FIN : bru marked instr finish. ++event:0x1016e counters:0 um:zero minimum:1000 name:PM_MRK_BR_CMPL : Branch Instruction completed. ++event:0x3013a counters:2 um:zero minimum:1000 name:PM_MRK_CRU_FIN : IFU non-branch marked instruction finished. ++event:0x4d148 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x2d128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x3d148 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load. ++event:0x2c128 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load.
++event:0x3d14c counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4 : The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x2c12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DL4_CYC : Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load. ++event:0x4d14c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load. ++event:0x2d12c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_DMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load. + event:0x1d142 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2 : The processor's data cache was reloaded from local core's L2 due to a marked load. +-event:0x4c12e counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L2 due to a marked load. +-event:0x4c122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. ++event:0x4d146 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load. ++event:0x2d126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load. ++event:0x3d146 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load. 
++event:0x2c126 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L21_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load. ++event:0x4c12e counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2MISS_CYC : Duration in cycles to reload from a location other than the local core's L2 due to a marked load. ++event:0x4c122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_CYC : Duration in cycles to reload from local core's L2 due to a marked load. ++event:0x3d140 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST : The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load. ++event:0x2c120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC : Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load. ++event:0x4d140 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER : The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d120 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC : Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load. ++event:0x2d140 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF : The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. ++event:0x4d120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_MEPF_CYC : Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked load. + event:0x1d140 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT : The processor's data cache was reloaded from local core's L2 without conflict due to a marked load.
+-event:0x4c120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. ++event:0x4c120 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L2 without conflict due to a marked load. + event:0x4d142 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3 : The processor's data cache was reloaded from local core's L3 due to a marked load. +-event:0x2d12e counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a localtion other than the local core's L3 due to a marked load. +-event:0x2d122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. ++event:0x4d144 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD : The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x3d144 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR : The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2c124 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_ECO_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load. ++event:0x2d144 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD : The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load. 
++event:0x4d124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_MOD_CYC : Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load. ++event:0x1d146 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR : The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x4c126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L31_SHR_CYC : Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load. ++event:0x2d12e counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3MISS_CYC : Duration in cycles to reload from a location other than the local core's L3 due to a marked load. ++event:0x2d122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_CYC : Duration in cycles to reload from local core's L3 due to a marked load. ++event:0x3d142 counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT : The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load. ++event:0x2c122 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC : Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load. ++event:0x2d142 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF : The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. ++event:0x4d122 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_MEPF_CYC : Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked load. + event:0x1d144 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT : The processor's data cache was reloaded from local core's L3 without conflict due to a marked load.
+-event:0x4c124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. +-event:0x1d14c counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. +-event:0x4c12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. +-event:0x2d148 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. +-event:0x4d128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. ++event:0x4c124 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC : Duration in cycles to reload from local core's L3 without conflict due to a marked load. ++event:0x1d14c counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4 : The processor's data cache was reloaded from the local chip's L4 cache due to a marked load. ++event:0x4c12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LL4_CYC : Duration in cycles to reload from the local chip's L4 cache due to a marked load. ++event:0x2d148 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM : The processor's data cache was reloaded from the local chip's Memory due to a marked load. ++event:0x4d128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_LMEM_CYC : Duration in cycles to reload from the local chip's Memory due to a marked load. + event:0x2d14c counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY : The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load. 
+-event:0x4d12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d12c counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_MEMORY_CYC : Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load. ++event:0x4d14a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x2d12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load. ++event:0x1d148 counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE : The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a marked load. ++event:0x4c128 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC : Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load. ++event:0x2d146 counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD : The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4d126 counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_MOD_CYC : Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. 
++event:0x1d14a counters:0 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR : The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x4c12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL2L3_SHR_CYC : Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load. ++event:0x2d14a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4 : The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x4d12a counters:3 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RL4_CYC : Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load. ++event:0x3d14a counters:2 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM : The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x2c12a counters:1 um:zero minimum:1000 name:PM_MRK_DATA_FROM_RMEM_CYC : Duration in cycles to reload from another chip's memory on the same Node or Group ( Remote) due to a marked load. ++event:0x40118 counters:3 um:zero minimum:1000 name:PM_MRK_DCACHE_RELOAD_INTV : Combined Intervention event. ++event:0x4d154 counters:3 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16G : Marked Data ERAT Miss (Data TLB Access) page size 16G. ++event:0x3d154 counters:2 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_16M : Marked Data ERAT Miss (Data TLB Access) page size 16M. ++event:0x1d156 counters:0 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_4K : Marked Data ERAT Miss (Data TLB Access) page size 4K. ++event:0x2d154 counters:1 um:zero minimum:1000 name:PM_MRK_DERAT_MISS_64K : Marked Data ERAT Miss (Data TLB Access) page size 64K. 
++event:0x20132 counters:1 um:zero minimum:1000 name:PM_MRK_DFU_FIN : Decimal Unit marked Instruction Finish. ++event:0x4f148 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f148 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. ++event:0x3f14c counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. ++event:0x4f14c counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_DMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. ++event:0x1f142 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2 : A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. ++event:0x4f146 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. ++event:0x3f146 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L21_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. ++event:0x1f14e counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a marked data side request.
++event:0x3f140 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST : A Page Table Entry was loaded into the TLB from local core's L2 with load hit store conflict due to a marked data side request. ++event:0x4f140 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER : A Page Table Entry was loaded into the TLB from local core's L2 with dispatch conflict due to a marked data side request. ++event:0x2f140 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_MEPF : A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. ++event:0x1f140 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L2_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. ++event:0x4f142 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3 : A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. ++event:0x4f144 counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. ++event:0x3f144 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_ECO_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. ++event:0x2f144 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. ++event:0x1f146 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L31_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request. 
++event:0x4f14e counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3MISS : A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a marked data side request. ++event:0x3f142 counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. ++event:0x2f142 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_MEPF : A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state. due to a marked data side request. ++event:0x1f144 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_L3_NO_CONFLICT : A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. ++event:0x1f14c counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LL4 : A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request. ++event:0x2f148 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_LMEM : A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. ++event:0x2f14c counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_MEMORY : A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. ++event:0x4f14a counters:3 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. ++event:0x1f148 counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_ON_CHIP_CACHE : A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request.
++event:0x2f146 counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_MOD : A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x1f14a counters:0 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL2L3_SHR : A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. ++event:0x2f14a counters:1 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RL4 : A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. ++event:0x3f14a counters:2 um:zero minimum:1000 name:PM_MRK_DPTEG_FROM_RMEM : A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. ++event:0x1d158 counters:0 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16G : Marked Data TLB Miss page size 16G. ++event:0x4d156 counters:3 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_16M : Marked Data TLB Miss page size 16M. ++event:0x2d156 counters:1 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_4K : Marked Data TLB Miss page size 4k. ++event:0x3d156 counters:2 um:zero minimum:1000 name:PM_MRK_DTLB_MISS_64K : Marked Data TLB Miss page size 64K. ++event:0x40154 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL : Marked store had to do a bkill. ++event:0x2f150 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_BKILL_CYC : cycles L2 RC took for a bkill. ++event:0x3015e counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_CLAIM_RTY : Sampled store did a rwitm and got a rty. ++event:0x30154 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM : Marked store had to do a dclaim. ++event:0x2f152 counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_DCLAIM_CYC : cycles L2 RC took for a dclaim. 
++event:0x30156 counters:2 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH : ttype and cresp matched as specified in MMCR1. ++event:0x4f152 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_MATCH_CYC : cresp/ttype match cycles. ++event:0x4015e counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_RTY : Sampled L2 reads retry count. ++event:0x1015e counters:0 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RD_T_INTV : Sampled Read got a T intervention. ++event:0x4f150 counters:3 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_CYC : cycles L2 RC took for a rwitm. ++event:0x2015e counters:1 um:zero minimum:1000 name:PM_MRK_FAB_RSP_RWITM_RTY : Sampled store did a rwitm and got a rty. ++event:0x3012e counters:2 um:zero minimum:1000 name:PM_MRK_FILT_MATCH : Marked filter Match. ++event:0x1013c counters:0 um:zero minimum:1000 name:PM_MRK_FIN_STALL_CYC : Marked instruction Finish Stall cycles (marked finish after NTC) (use edge detect to count #). ++event:0x20134 counters:1 um:zero minimum:1000 name:PM_MRK_FXU_FIN : fxu marked instr finish. + event:0x40130 counters:3 um:zero minimum:1000 name:PM_MRK_GRP_CMPL : marked instruction finished (completed). ++event:0x4013a counters:3 um:zero minimum:1000 name:PM_MRK_GRP_IC_MISS : Marked Group experienced I cache miss. ++event:0x3013c counters:2 um:zero minimum:1000 name:PM_MRK_GRP_NTC : Marked group ntc cycles. + event:0x20130 counters:1 um:zero minimum:1000 name:PM_MRK_INST_DECODED : marked instruction decoded. Name from ISU? ++event:0x30130 counters:2 um:zero minimum:1000 name:PM_MRK_INST_FIN : marked instr finish any unit . ++event:0x10132 counters:0 um:zero minimum:1000 name:PM_MRK_INST_ISSUED : Marked instruction issued. ++event:0x40134 counters:3 um:zero minimum:1000 name:PM_MRK_INST_TIMEO : marked Instruction finish timeout (instruction lost). + event:0x20114 counters:1 um:zero minimum:1000 name:PM_MRK_L2_RC_DISP : Marked Instruction RC dispatched in L2. 
++event:0x3012a counters:2 um:zero minimum:1000 name:PM_MRK_L2_RC_DONE : Marked RC done. ++event:0x40116 counters:3 um:zero minimum:1000 name:PM_MRK_LARX_FIN : Larx finished . ++event:0x1013f counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED : Marked Load exposed Miss (use edge detect to count #) ++event:0x1013e counters:0 um:zero minimum:1000 name:PM_MRK_LD_MISS_EXPOSED_CYC : Marked Load exposed Miss (use edge detect to count #). + event:0x4013e counters:3 um:zero minimum:1000 name:PM_MRK_LD_MISS_L1_CYC : Marked ld latency. ++event:0x40132 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_FIN : lsu marked instr finish. ++event:0xd180 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH : Flush: (marked) : All Cases42 ++event:0xd188 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_LRQ : Flush: (marked) LRQMarked LRQ flushes ++event:0xd18a counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_SRQ : Flush: (marked) SRQMarked SRQ lhs flushes ++event:0xd184 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_ULD : Flush: (marked) Unaligned LoadMarked unaligned load flushes ++event:0xd186 counters:0,1,2,3 um:zero minimum:1000 name:PM_MRK_LSU_FLUSH_UST : Flush: (marked) Unaligned StoreMarked unaligned store flushes ++event:0x40164 counters:3 um:zero minimum:1000 name:PM_MRK_LSU_REJECT : LSU marked reject (up to 2 per cycle). ++event:0x30164 counters:2 um:zero minimum:1000 name:PM_MRK_LSU_REJECT_ERAT_MISS : LSU marked reject due to ERAT (up to 2 per cycle). ++event:0x20112 counters:1 um:zero minimum:1000 name:PM_MRK_NTF_FIN : Marked next to finish instruction finished. ++event:0x1d15e counters:0 um:zero minimum:10000 name:PM_MRK_RUN_CYC : Marked run cycles. ++event:0x1d15a counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_EFF : Marked src pref track was effective. ++event:0x3d15a counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked src. 
++event:0x4d15c counters:3 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD : Prefetch tracked was moderate for marked src. ++event:0x1d15c counters:0 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L2 : Marked src Prefetch Tracked was moderate (source L2). ++event:0x3d15c counters:2 um:zero minimum:1000 name:PM_MRK_SRC_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked src. + event:0x3013e counters:2 um:zero minimum:1000 name:PM_MRK_STALL_CMPLU_CYC : Marked Group Completion Stall cycles (use edge detect to count #). ++event:0x3e158 counters:2 um:zero minimum:1000 name:PM_MRK_STCX_FAIL : marked stcx failed. ++event:0x30134 counters:2 um:zero minimum:1000 name:PM_MRK_ST_CMPL_INT : marked store complete (data home) with intervention. ++event:0x3f150 counters:2 um:zero minimum:1000 name:PM_MRK_ST_DRAIN_TO_L2DISP_CYC : cycles to drain st from core to L2. ++event:0x3012c counters:2 um:zero minimum:1000 name:PM_MRK_ST_FWD : Marked st forwards. ++event:0x1f150 counters:0 um:zero minimum:1000 name:PM_MRK_ST_L2DISP_TO_CMPL_CYC : cycles from L2 rc disp to l2 rc completion. ++event:0x20138 counters:1 um:zero minimum:1000 name:PM_MRK_ST_NEST : Marked store sent to nest. ++event:0x1c15a counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_EFF : Marked target pref track was effective. ++event:0x3c15a counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_INEFF : Prefetch tracked was ineffective for marked target. ++event:0x4c15c counters:3 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD : Prefetch tracked was moderate for marked target. ++event:0x1c15c counters:0 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L2 : Marked target Prefetch Tracked was moderate (source L2). ++event:0x3c15c counters:2 um:zero minimum:1000 name:PM_MRK_TGT_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3 hit) for marked target. ++event:0x30132 counters:2 um:zero minimum:1000 name:PM_MRK_VSU_FIN : vsu (fpu) marked instr finish. 
++event:0x3d15e counters:2 um:zero minimum:10000 name:PM_MULT_MRK : mult marked instr. ++event:0x20b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_NESTED_TEND : Completion time nested tend + event:0x3006e counters:2 um:zero minimum:10000 name:PM_NEST_REF_CLK : Nest reference clocks. +-event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Cycles after all instructions have finished to group completed. ++event:0x20b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_NON_FAV_TBEGIN : Dispatch time non favored tbegin ++event:0x328084 counters:1 um:zero minimum:10000 name:PM_NON_TM_RST_SC : non tm snp rst tm sc ++event:0x2001a counters:1 um:zero minimum:10000 name:PM_NTCG_ALL_FIN : Ccycles after all instructions have finished to group completed. ++event:0x20ac counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TBEGIN : Completion time outer tbegin ++event:0x20ae counters:0,1,2,3 um:zero minimum:10000 name:PM_OUTER_TEND : Completion time outer tend + event:0x20010 counters:1 um:zero minimum:10000 name:PM_PMC1_OVERFLOW : Overflow from counter 1. + event:0x30010 counters:2 um:zero minimum:10000 name:PM_PMC2_OVERFLOW : Overflow from counter 2. ++event:0x30020 counters:2 um:zero minimum:10000 name:PM_PMC2_REWIND : PMC2 Rewind Event (did not match condition). ++event:0x10022 counters:0 um:zero minimum:10000 name:PM_PMC2_SAVED : PMC2 Rewind Value saved (matched condition). + event:0x40010 counters:3 um:zero minimum:10000 name:PM_PMC3_OVERFLOW : Overflow from counter 3. + event:0x10010 counters:0 um:zero minimum:10000 name:PM_PMC4_OVERFLOW : Overflow from counter 4. ++event:0x10020 counters:0 um:zero minimum:10000 name:PM_PMC4_REWIND : PMC4 Rewind Event (did not match condition). ++event:0x30022 counters:2 um:zero minimum:10000 name:PM_PMC4_SAVED : PMC4 Rewind Value saved (matched condition). ++event:0x10024 counters:0 um:zero minimum:10000 name:PM_PMC5_OVERFLOW : Overflow from counter 5. 
+ event:0x30024 counters:2 um:zero minimum:10000 name:PM_PMC6_OVERFLOW : Overflow from counter 6. ++event:0x2005a counters:1 um:zero minimum:10000 name:PM_PREF_TRACKED : Total number of Prefetch Operations that were tracked. ++event:0x1005a counters:0 um:zero minimum:10000 name:PM_PREF_TRACK_EFF : Prefetch Tracked was effective. ++event:0x3005a counters:2 um:zero minimum:10000 name:PM_PREF_TRACK_INEFF : Prefetch tracked was ineffective. ++event:0x4005a counters:3 um:zero minimum:10000 name:PM_PREF_TRACK_MOD : Prefetch tracked was moderate. ++event:0x1005c counters:0 um:zero minimum:10000 name:PM_PREF_TRACK_MOD_L2 : Prefetch Tracked was moderate (source L2). ++event:0x3005c counters:2 um:zero minimum:10000 name:PM_PREF_TRACK_MOD_L3 : Prefetch tracked was moderate (L3). ++event:0x40014 counters:3 um:zero minimum:10000 name:PM_PROBE_NOP_DISP : ProbeNops dispatched. ++event:0xe084 counters:0,1,2,3 um:zero minimum:10000 name:PM_PTE_PREFETCH : PTE prefetches42 ++event:0x10054 counters:0 um:zero minimum:10000 name:PM_PUMP_CPRED : Pump prediction correct. Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x40052 counters:3 um:zero minimum:10000 name:PM_PUMP_MPRED : Pump Mis prediction Counts across all types of pumpsfor all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x16081 counters:0 um:zero minimum:10000 name:PM_RC0_ALLOC : 0.0 ++event:0x16080 counters:0 um:zero minimum:10000 name:PM_RC0_BUSY : RC mach 0 Busy. 
Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0x200301ea counters:2 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_1024 : Reload latency exceeded 1024 cyc ++event:0x200401ec counters:3 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_2048 : Threshold counter exceeded a value of 2048 ++event:0x200101e8 counters:0 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_256 : Threshold counter exceed a count of 256 ++event:0x200201e6 counters:1 um:zero minimum:10000 name:PM_RC_LIFETIME_EXC_32 : Reload latency exceeded 32 cyc ++event:0x36088 counters:2 um:zero minimum:10000 name:PM_RC_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x34808e counters:3 um:zero minimum:10000 name:PM_RD_CLEARING_SC : rd clearing sc ++event:0x34808c counters:3 um:zero minimum:10000 name:PM_RD_FORMING_SC : rd forming sc ++event:0x428086 counters:1 um:zero minimum:10000 name:PM_RD_HIT_PF : rd machine hit l3 pf machine ++event:0x20004 counters:1 um:zero minimum:10000 name:PM_REAL_SRQ_FULL : Out of real srq entries. ++event:0x3006c counters:2 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_MODE : Cycles run latch is set and core is in SMT2 mode. ++event:0x2006a counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT2_SHRD_MODE : Cycles run latch is set and core is in SMT2-shared mode. ++event:0x1006a counters:0 um:zero minimum:100000 name:PM_RUN_CYC_SMT2_SPLIT_MODE : Cycles run latch is set and core is in SMT2-split mode. ++event:0x2006c counters:1 um:zero minimum:10000 name:PM_RUN_CYC_SMT4_MODE : Cycles run latch is set and core is in SMT4 mode. ++event:0x4006c counters:3 um:zero minimum:100000 name:PM_RUN_CYC_SMT8_MODE : Cycles run latch is set and core is in SMT8 mode. ++event:0x1006c counters:0 um:zero minimum:100000 name:PM_RUN_CYC_ST_MODE : Cycles run latch is set and core is in ST mode. 
++event:0x10008 counters:0 um:zero minimum:10000 name:PM_RUN_SPURR : Run SPURR. ++event:0xf082 counters:0,1,2,3 um:zero minimum:10000 name:PM_SEC_ERAT_HIT : secondary ERAT Hit42 ++event:0x508c counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_CREATED : Store-Hit-Load Table Entry Created ++event:0x508e counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_CONVERT : Store-Hit-Load Table Read Hit with entry Enabled ++event:0x5090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SHL_ST_DISABLE : Store-Hit-Load Table Read Hit with entry Disabled (entry was disabled due to the entry shown to not prevent the flush) ++event:0x26085 counters:1 um:zero minimum:10000 name:PM_SN0_ALLOC : 0.0 ++event:0x26084 counters:1 um:zero minimum:10000 name:PM_SN0_BUSY : SN mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point) ++event:0xd0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_SNOOP_TLBIE : TLBIE snoopSnoop TLBIE ++event:0x338088 counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_M : snp tm st hit m mu ++event:0x33808a counters:2 um:zero minimum:10000 name:PM_SNP_TM_HIT_T : snp tm_st_hit t tn te ++event:0x4608c counters:3 um:zero minimum:10000 name:PM_SN_USAGE : Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 SN machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running ++event:0x10028 counters:0 um:zero minimum:10000 name:PM_STALL_END_GCT_EMPTY : Count ended because GCT went empty. ++event:0x1e058 counters:0 um:zero minimum:10000 name:PM_STCX_FAIL : stcx failed . ++event:0xc090 counters:0,1,2,3 um:zero minimum:10000 name:PM_STCX_LSU : STCX executed reported at sent to nest42 ++event:0x717080 counters:0 um:zero minimum:10000 name:PM_ST_CAUSED_FAIL : Non TM St caused any thread to fail ++event:0x20016 counters:1 um:zero minimum:10000 name:PM_ST_CMPL : Store completion count. ++event:0x20018 counters:1 um:zero minimum:10000 name:PM_ST_FWD : Store forwards that finished. 
++event:0x0 counters:0,1,2,3 um:zero minimum:10000 name:PM_SUSPENDED : Counter OFF. ++event:0x3090 counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_CANCEL : SWAP cancel , rtag not available ++event:0x3092 counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_CANCEL_GPR : SWAP cancel , rtag not available for gpr ++event:0x308c counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_COMPLETE : swap cast in completed ++event:0x308e counters:0,1,2,3 um:zero minimum:10000 name:PM_SWAP_COMPLETE_GPR : swap cast in completed fpr gpr ++event:0x15152 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_LINK : Marked Branch and link branch that can cause a synchronous interrupt. ++event:0x1515c counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_BR_MPRED : Marked Branch mispredict that can cause a synchronous interrupt. ++event:0x15156 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_FX_DIVIDE : Marked fixed point divide that can cause a synchronous interrupt. ++event:0x15158 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2HIT : Marked L2 Hits that can throw a synchronous interrupt. ++event:0x1515a counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L2MISS : Marked L2 Miss that can throw a synchronous interrupt. ++event:0x15154 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_L3MISS : Marked L3 misses that can throw a synchronous interrupt. ++event:0x15150 counters:0 um:zero minimum:10000 name:PM_SYNC_MRK_PROBE_NOP : Marked probeNops which can cause synchronous interrupts. ++event:0x30050 counters:2 um:zero minimum:10000 name:PM_SYS_PUMP_CPRED : Initial and Final Pump Scope and data sourced across this scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). 
++event:0x30052 counters:2 um:zero minimum:10000 name:PM_SYS_PUMP_MPRED : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope(Chip/Group) OR Final Pump Scope(system) got data from source that was at smaller scope(Chip/group) Final pump was system pump and initial pump was chip or group or ++event:0x40050 counters:3 um:zero minimum:10000 name:PM_SYS_PUMP_MPRED_RTY : Final Pump Scope(system) to get data sourced, ended up larger than Initial Pump Scope (Chip or Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate). ++event:0x10026 counters:0 um:zero minimum:10000 name:PM_TABLEWALK_CYC : Tablewalk Active. ++event:0xe086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TABLEWALK_CYC_PREF : tablewalk qualified for pte prefetches42 ++event:0x20b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_TABORT_TRECLAIM : Completion time tabortnoncd, tabortcd, treclaim ++event:0xe0ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TEND_PEND_CYC : TEND latency per thread42 + event:0x2000c counters:1 um:zero minimum:100000 name:PM_THRD_ALL_RUN_CYC : All Threads in Run_cycles (was both threads in run_cycles). ++event:0x10012 counters:0 um:zero minimum:10000 name:PM_THRD_GRP_CMPL_BOTH_CYC : Two threads finished same cycle (gated by run latch). 
++event:0x40bc counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_0_1_CYC : Cycles thread running at priority level 0 or 1 ++event:0x40be counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_2_3_CYC : Cycles thread running at priority level 2 or 3 ++event:0x5080 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_4_5_CYC : Cycles thread running at priority level 4 or 5 ++event:0x5082 counters:0,1,2,3 um:zero minimum:1000 name:PM_THRD_PRIO_6_7_CYC : Cycles thread running at priority level 6 or 7 ++event:0x3098 counters:0,1,2,3 um:zero minimum:10000 name:PM_THRD_REBAL_CYC : cycles rebalance was active + event:0x4016e counters:3 um:zero minimum:10000 name:PM_THRESH_NOT_MET : Threshold counter did not meet threshold. ++event:0x30058 counters:2 um:zero minimum:10000 name:PM_TLBIE_FIN : tlbie finished. ++event:0x20066 counters:1 um:zero minimum:10000 name:PM_TLB_MISS : TLB Miss (I + D). ++event:0x20b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_BEGIN_ALL : Tm any tbegin ++event:0x318082 counters:0 um:zero minimum:10000 name:PM_TM_CAM_OVERFLOW : l3 tm cam overflow during L2 co of SC ++event:0x74708c counters:3 um:zero minimum:10000 name:PM_TM_CAP_OVERFLOW : TM Footprint Capactiy Overflow ++event:0x20ba counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_END_ALL : Tm any tend ++event:0x3086 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CONF_NON_TM : TEXAS fail reason @ completion ++event:0x3088 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_CON_TM : TEXAS fail reason @ completion ++event:0xe0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_DISALLOW : TM fail disallow42 ++event:0x3084 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_FOOTPRINT_OVERFLOW : TEXAS fail reason @ completion ++event:0xe0b8 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_NON_TX_CONFLICT : Non transactional conflict from LSU whtver gets repoted to texas42 ++event:0x308a counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_SELF : TEXAS fail 
reason @ completion ++event:0xe0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TLBIE : TLBIE hit bloom filter42 ++event:0xe0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_FAIL_TX_CONFLICT : Transactional conflict from LSU, whatever gets reported to texas 42 ++event:0x727086 counters:1 um:zero minimum:10000 name:PM_TM_FAV_CAUSED_FAIL : TM Load (fav) caused another thread to fail ++event:0x717082 counters:0 um:zero minimum:10000 name:PM_TM_LD_CAUSED_FAIL : Non TM Ld caused any thread to fail ++event:0x727084 counters:1 um:zero minimum:10000 name:PM_TM_LD_CONF : TM Load (fav or non-fav) ran into conflict (failed) ++event:0x328086 counters:1 um:zero minimum:10000 name:PM_TM_RST_SC : tm snp rst tm sc ++event:0x318080 counters:0 um:zero minimum:10000 name:PM_TM_SC_CO : l3 castout tm Sc line ++event:0x73708a counters:2 um:zero minimum:10000 name:PM_TM_ST_CAUSED_FAIL : TM Store (fav or non-fav) caused another thread to fail ++event:0x737088 counters:2 um:zero minimum:10000 name:PM_TM_ST_CONF : TM Store (fav or non-fav) ran into conflict (failed) ++event:0x20bc counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TBEGIN : Tm nested tbegin ++event:0x10060 counters:0 um:zero minimum:10000 name:PM_TM_TRANS_RUN_CYC : run cycles in transactional state. ++event:0x30060 counters:2 um:zero minimum:10000 name:PM_TM_TRANS_RUN_INST : Instructions completed in transactional state. ++event:0x3080 counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TRESUME : Tm resume ++event:0x20be counters:0,1,2,3 um:zero minimum:10000 name:PM_TM_TSUSPEND : Tm suspend ++event:0x2e012 counters:1 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_CYC : run cycles spent in successful transactions. ++event:0x4e014 counters:3 um:zero minimum:10000 name:PM_TM_TX_PASS_RUN_INST : run instructions spent in successful transactions. 
++event:0xe08c counters:0,1,2,3 um:zero minimum:10000 name:PM_UP_PREF_L3 : Micropartition prefetch42 ++event:0xe08e counters:0,1,2,3 um:zero minimum:10000 name:PM_UP_PREF_POINTER : Micrpartition pointer prefetches42 ++event:0xa0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_16FLOP : Sixteen flops operation (SP vector versions of fdiv,fsqrt) ++event:0xa080 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_1FLOP : one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation finishedDecode into 1,2,4 FLOP according to instr IOP, multiplied by #vector elements according to route( eg x1, x2, x4) Only if instr sends finish to ISU ++event:0xa098 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_2FLOP : two flops operation (scalar fmadd, fnmadd, fmsub, fnmsub and DP vector versions of single flop instructions) ++event:0xa09c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_4FLOP : four flops operation (scalar fdiv, fsqrt, DP vector version of fmadd, fnmadd, fmsub, fnmsub, SP vector versions of single flop instructions) ++event:0xa0a0 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_8FLOP : eight flops operation (DP vector versions of fdiv,fsqrt and SP vector versions of fmadd,fnmadd,fmsub,fnmsub) ++event:0xb0a4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_COMPLEX_ISSUED : Complex VMX instruction issued ++event:0xb0b4 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_CY_ISSUED : Cryptographic instruction RFC02196 Issued ++event:0xb0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DD_ISSUED : 64BIT Decimal Issued ++event:0xa08c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_2FLOP : DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres ,fsqrte, fneg ++event:0xa090 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_FMA : DP vector version of fmadd,fnmadd,fmsub,fnmsub ++event:0xa094 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DP_FSQRT_FDIV : DP vector versions of fdiv,fsqrt ++event:0xb0ac 
counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_DQ_ISSUED : 128BIT Decimal Issued ++event:0xb0b0 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_EX_ISSUED : Direct move 32/64b VRFtoGPR RFC02206 Issued ++event:0xa0bc counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FIN : VSU0 Finished an instruction ++event:0xa084 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FMA : two flops operation (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only! ++event:0xb098 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FPSCR : Move to/from FPSCR type instruction issued on Pipe 0 ++event:0xa088 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_FSQRT_FDIV : four flops operation (fdiv,fsqrt) Scalar Instructions only! ++event:0xb090 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_PERMUTE_ISSUED : Permute VMX Instruction Issued ++event:0xb088 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SCALAR_DP_ISSUED : Double Precision scalar instruction issued on Pipe0 ++event:0xb094 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SIMPLE_ISSUED : Simple VMX instruction issued ++event:0xa0a8 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SINGLE : FPU single precision ++event:0xb09c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_SQ : Store Vector Issued ++event:0xb08c counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_STF : FPU store (SP or DP) issued on Pipe0 ++event:0xb080 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_VECTOR_DP_ISSUED : Double Precision vector instruction issued on Pipe0 ++event:0xb084 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU0_VECTOR_SP_ISSUED : Single Precision vector instruction issued (executed) ++event:0xa0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_16FLOP : Sixteen flops operation (SP vector versions of fdiv,fsqrt) ++event:0xa082 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_1FLOP : one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation finished ++event:0xa09a 
counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_2FLOP : two flops operation (scalar fmadd, fnmadd, fmsub, fnmsub and DP vector versions of single flop instructions) ++event:0xa09e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_4FLOP : four flops operation (scalar fdiv, fsqrt, DP vector version of fmadd, fnmadd, fmsub, fnmsub, SP vector versions of single flop instructions) ++event:0xa0a2 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_8FLOP : eight flops operation (DP vector versions of fdiv,fsqrt and SP vector versions of fmadd,fnmadd,fmsub,fnmsub) ++event:0xb0a6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_COMPLEX_ISSUED : Complex VMX instruction issued ++event:0xb0b6 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_CY_ISSUED : Cryptographic instruction RFC02196 Issued ++event:0xb0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DD_ISSUED : 64BIT Decimal Issued ++event:0xa08e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_2FLOP : DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres ,fsqrte, fneg ++event:0xa092 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_FMA : DP vector version of fmadd,fnmadd,fmsub,fnmsub ++event:0xa096 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DP_FSQRT_FDIV : DP vector versions of fdiv,fsqrt ++event:0xb0ae counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_DQ_ISSUED : 128BIT Decimal Issued ++event:0xb0b2 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_EX_ISSUED : Direct move 32/64b VRFtoGPR RFC02206 Issued ++event:0xa0be counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FIN : VSU1 Finished an instruction ++event:0xa086 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FMA : two flops operation (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only! 
++event:0xb09a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FPSCR : Move to/from FPSCR type instruction issued on Pipe 0 ++event:0xa08a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_FSQRT_FDIV : four flops operation (fdiv,fsqrt) Scalar Instructions only! ++event:0xb092 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_PERMUTE_ISSUED : Permute VMX Instruction Issued ++event:0xb08a counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SCALAR_DP_ISSUED : Double Precision scalar instruction issued on Pipe1 ++event:0xb096 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SIMPLE_ISSUED : Simple VMX instruction issued ++event:0xa0aa counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SINGLE : FPU single precision ++event:0xb09e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_SQ : Store Vector Issued ++event:0xb08e counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_STF : FPU store (SP or DP) issued on Pipe1 ++event:0xb082 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_VECTOR_DP_ISSUED : Double Precision vector instruction issued on Pipe1 ++event:0xb086 counters:0,1,2,3 um:zero minimum:10000 name:PM_VSU1_VECTOR_SP_ISSUED : Single Precision vector instruction issued (executed) diff --git a/SOURCES/oprofile-ppc64-equivalent.patch b/SOURCES/oprofile-ppc64-equivalent.patch new file mode 100644 index 0000000..3c5b063 --- /dev/null +++ b/SOURCES/oprofile-ppc64-equivalent.patch @@ -0,0 +1,140 @@ +commit 4f5a0d9c4419f3b88586d665272eb35f270a0551 +Author: Maynard Johnson +Date: Tue Dec 17 16:04:33 2013 -0600 + + Allow all native events for IBM POWER8 in POWER7 compat mode + + Certain older Linux distributions will support the new IBM POWER8 + processor, but only in a limited mode, since much of the new + kernel code needed to fully support the POWER8 was not backported + to these older distros. This limited mode is referred to as + "POWER7 compat mode" since the kernel can support only the features + that were also available on that earlier IBM processor. 
+ + Changes I originally made to support POWER8 assumed that there + would not be full POWER8 performance monitor unit capabilities when + in POWER7 compat mode, and thus, the current oprofile code supports + only a limited subset of POWER8 events (i.e., events which were also + available on the POWER7). However, I've recently been made aware + that these older distros actually do have complete backports of the + POWER8 perf_events kernel subsystem code, making them fully aware of + all POWER8 events. This patch allows operf and ocount to use all + of the POWER8 events, regardless of what mode or distribution we + are running on. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 4bb34b7..cd75ad4 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -290,7 +290,16 @@ static op_cpu _try_ppc64_arch_generic_cpu(void) + } + } + if (!platforms_are_equivalent) { +- if (strcmp(platform, "power7") == 0) ++ // FIXME ++ /* For POWER8 running in POWER7 compat mode (RHEL 6.5 and SLES 11 SP4), ++ * the kernel will have enough POWER8-specific PMU code so we can utilize ++ * all of the POWER8 events. In general, this is not necessarily the case ++ * when running in compat mode. This code needs to be inspected for every ++ * new IBM Power processor released, but for now, we'll assume that for the ++ * next processor model (assuming there will be something like a POWER9?), ++ * we should use just the architected events when running POWER8 compat mode. ++ */ ++ if (strcmp(platform, "power8") == 0) + cpu_type = CPU_PPC64_ARCH_V1; + } + } +commit 88ed74bade0096042d643a6d7e68c2cbc4b6e34d +Author: Maynard Johnson +Date: Thu Jan 9 15:07:21 2014 -0600 + + Fix "Unable to open cpu_type file for reading" for IBM POWER7+ + + Using operf to do profiling on an IBM POWER7+ may result in + the following error message: + + Unable to open cpu_type file for reading + + This patch fixes the problem. 
There is also a simple workaround of + running 'opcontrol --init'. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index cd75ad4..7d5262c 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -326,6 +326,8 @@ static op_cpu _get_ppc64_cpu_type(void) + for (i = 0; i < (int)len ; i++) + cpu_name_lowercase[i] = tolower(cpu_name[i]); + ++ if (strncmp(cpu_name_lowercase, "power7+", 7) == 0) ++ cpu_name_lowercase[6] = '\0'; + cpu_type_str[0] = '\0'; + strcat(cpu_type_str, "ppc64/"); + strncat(cpu_type_str, cpu_name_lowercase, len); +commit 65176cb1af0fb1f6c7d3ddba4ab5f5f23c5f7c62 +Author: Maynard Johnson +Date: Tue Jan 21 14:43:02 2014 -0600 + + Fix regression in IBM POWER8 running in POWER7 compat mode + + A commit made on Dec 17, 2013 ("Allow all native events for IBM POWER8 + in POWER7 compat mode) broke support for POWER8 in POWER7 compat mode. + Instead, oprofile attempts to treat it as a normal POWER7 processor, + which is not correct. A user reported the following error when + running operf with the default CYCLES event: + + terminate called after throwing an instance of 'std::runtime_error' + what(): libpfm cannot find event code for CYCLES; cannot continue + Aborted + + This patch fixes this problem. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index a3ad804..2907f36 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -300,7 +300,9 @@ static op_cpu _try_ppc64_arch_generic_cpu(void) + * next processor model (assuming there will be something like a POWER9?), + * we should use just the architected events when running POWER8 compat mode. 
+ */ +- if (strcmp(platform, "power8") == 0) ++ if ((strcmp(platform, "power7") == 0) && (strcmp(base_platform, "power8") == 0)) ++ cpu_type = CPU_PPC64_POWER8; ++ else + cpu_type = CPU_PPC64_ARCH_V1; + } + } +commit 7243fa4ed8a25c6e59225a863fd263ce70989087 +Author: Maynard Johnson +Date: Tue Feb 4 08:27:10 2014 -0600 + + Make cpu type POWER8E equivalent to POWER8 + + Recent mainline kernel changes resulted in a cpu type of + "POWER8E" being displayed in /proc/cpuinfo for certain revisions + of the IBM POWER8 processor model. But for profiling and + counting of native events, we can ignore the differences between + POWER8 and POWER8E. This patch addresses that issue. + + Signed-off-by: Maynard Johnson + +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 2907f36..1ae2913 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -331,6 +331,9 @@ static op_cpu _get_ppc64_cpu_type(void) + + if (strncmp(cpu_name_lowercase, "power7+", 7) == 0) + cpu_name_lowercase[6] = '\0'; ++ if (strncmp(cpu_name_lowercase, "power8e", 7) == 0) ++ cpu_name_lowercase[6] = '\0'; ++ + cpu_type_str[0] = '\0'; + strcat(cpu_type_str, "ppc64/"); + strncat(cpu_type_str, cpu_name_lowercase, len); diff --git a/SOURCES/oprofile-ppc64le.patch b/SOURCES/oprofile-ppc64le.patch new file mode 100644 index 0000000..31dd258 --- /dev/null +++ b/SOURCES/oprofile-ppc64le.patch @@ -0,0 +1,50 @@ +commit a265c549bff149f5e9064dca7d06b6689fb3d64e +Author: Maynard Johnson +Date: Thu Jan 9 15:47:09 2014 -0600 + + Enable oprofile for new ppc64le architecture + + Signed-off-by: Maynard Johnson + +diff --git a/configure.ac b/configure.ac +index 457145a..1e3a65f 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -184,7 +184,7 @@ AC_DEFINE_UNQUOTED(HAVE_PERF_EVENTS, $HAVE_PERF_EVENTS, [Kernel support for perf + AC_CANONICAL_HOST + if test "$HAVE_PERF_EVENTS" = "1"; then + PFM_LIB= +- if test "$host_cpu" = "powerpc64"; then ++ if test "$host_cpu" = "powerpc64le" -o "$host_cpu" = "powerpc64"; then + 
AC_CHECK_HEADER(perfmon/pfmlib.h,,[AC_MSG_ERROR([pfmlib.h not found; usually provided in papi devel package])]) + AC_CHECK_LIB(pfm,pfm_get_os_event_encoding, HAVE_LIBPFM3='0'; HAVE_LIBPFM='1', [ + AC_CHECK_LIB(pfm, pfm_get_event_name, HAVE_LIBPFM3='1'; HAVE_LIBPFM='1', +diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index 7d5262c..15c71ab 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -621,7 +621,8 @@ static op_cpu __get_cpu_type_alt_method(void) + fnmatch("i?86", uname_info.machine, 0) == 0) { + return _get_x86_64_cpu_type(); + } +- if (strncmp(uname_info.machine, "ppc64", 5) == 0) { ++ if ((strncmp(uname_info.machine, "ppc64", 5) == 0) || ++ (strncmp(uname_info.machine, "ppc64le", 7) == 0)) { + return _get_ppc64_cpu_type(); + } + if (strncmp(uname_info.machine, "arm", 3) == 0 || +diff --git a/libutil++/bfd_support.cpp b/libutil++/bfd_support.cpp +index 67edd09..4b744f8 100644 +--- a/libutil++/bfd_support.cpp ++++ b/libutil++/bfd_support.cpp +@@ -634,9 +634,7 @@ void bfd_info::translate_debuginfo_syms(asymbol ** dbg_syms, long nr_dbg_syms) + bool bfd_info::get_synth_symbols() + { + extern const bfd_target bfd_elf64_powerpc_vec; +- extern const bfd_target bfd_elf64_powerpcle_vec; +- bool is_elf64_powerpc_target = (abfd->xvec == &bfd_elf64_powerpc_vec) +- || (abfd->xvec == &bfd_elf64_powerpcle_vec); ++ bool is_elf64_powerpc_target = (abfd->xvec == &bfd_elf64_powerpc_vec); + + if (!is_elf64_powerpc_target) + return false; diff --git a/SOURCES/oprofile-rhbz1121205.patch b/SOURCES/oprofile-rhbz1121205.patch new file mode 100644 index 0000000..5a14bff --- /dev/null +++ b/SOURCES/oprofile-rhbz1121205.patch @@ -0,0 +1,1244 @@ +commit ebde58121d34e30f57ab173bf425244ce0712d48 +Author: Maynard Johnson +Date: Wed Oct 9 13:12:21 2013 -0500 + + Converge operf and ocount utility functions + + When the ocount tool was developed, a number of utility + functions were needed that were very similar to operf utility + functions, with just minor changes. 
The decision was made at + the time to copy these functions into ocount and change them + as needed. To avoid dual maintenance on very similar functions, + we should converge the two tools to use one common set of utility + functions. The main reason for not doing so in the first place + was to make it easier to review ocount patches and not have to + look at operf changes at the same time. + + Signed-off-by: Maynard Johnson + +diff --git a/Makefile.am b/Makefile.am +index 293114b..2fe8d2f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -19,9 +19,9 @@ SUBDIRS = \ + events \ + doc \ + gui \ ++ libpe_utils \ + libperf_events \ + pe_profiling \ +- libpe_utils \ + pe_counting \ + agents + #### ATTENTION #### +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index dc9459e..b85d175 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -52,7 +52,9 @@ extern op_cpu cpu_type; + + using namespace std; + +-static int _op_get_next_online_cpu(DIR * dir, struct dirent *entry) ++// Global functions ++ ++int op_pe_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry) + { + #define OFFLINE 0x30 + unsigned int cpu_num; +@@ -86,8 +88,6 @@ static int _op_get_next_online_cpu(DIR * dir, struct dirent *entry) + return cpu_num; + } + +-// Global functions +- + int op_pe_utils::op_get_sys_value(const char * filename) + { + char str[10]; +@@ -148,7 +148,7 @@ int op_pe_utils::op_get_cpu_for_perf_events_cap(void) + goto error; + } else { + struct dirent *entry = NULL; +- retval = _op_get_next_online_cpu(dir, entry); ++ retval = op_get_next_online_cpu(dir, entry); + closedir(dir); + } + } else { +@@ -310,40 +310,6 @@ int op_pe_utils::op_validate_app_name(char ** app, char ** save_appname) + + out: return rc; + } +-static int _get_next_online_cpu(DIR * dir, struct dirent *entry) +-{ +-#define OFFLINE 0x30 +- unsigned int cpu_num; +- char cpu_online_pathname[40]; +- int res; +- FILE * online; +- again: +- do { +- entry = readdir(dir); 
+- if (!entry) +- return -1; +- } while (entry->d_type != DT_DIR); +- +- res = sscanf(entry->d_name, "cpu%u", &cpu_num); +- if (res <= 0) +- goto again; +- +- errno = 0; +- snprintf(cpu_online_pathname, 40, "/sys/devices/system/cpu/cpu%u/online", cpu_num); +- if ((online = fopen(cpu_online_pathname, "r")) == NULL) { +- cerr << "Unable to open " << cpu_online_pathname << endl; +- if (errno) +- cerr << strerror(errno) << endl; +- return -1; +- } +- res = fgetc(online); +- fclose(online); +- if (res == OFFLINE) +- goto again; +- else +- return cpu_num; +-} +- + + set op_pe_utils::op_get_available_cpus(int max_num_cpus) + { +@@ -392,7 +358,7 @@ set op_pe_utils::op_get_available_cpus(int max_num_cpus) + if (all_cpus_avail) { + available_cpus.insert(cpu); + } else { +- real_cpu = _get_next_online_cpu(dir, entry); ++ real_cpu = op_get_next_online_cpu(dir, entry); + if (real_cpu < 0) { + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; +@@ -803,7 +769,8 @@ static bool convert_event_vals(vector * evt_vec) + + + +-void op_pe_utils::op_process_events_list(vector & passed_evts) ++void op_pe_utils::op_process_events_list(vector & passed_evts, ++ bool do_profiling, bool do_callgraph) + { + string cmd = OP_BINDIR; + +@@ -812,7 +779,9 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + << OP_MAX_EVENTS << "." 
<< endl; + exit(EXIT_FAILURE); + } +- cmd += "/ophelp --check-events --ignore-count "; ++ cmd += "/ophelp --check-events "; ++ if (!do_profiling) ++ cmd += "--ignore-count "; + for (unsigned int i = 0; i < passed_evts.size(); i++) { + FILE * fp; + string full_cmd = cmd; +@@ -825,6 +794,8 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + event_spec = _handle_powerpc_event_spec(event_spec); + #endif + ++ if (do_callgraph) ++ full_cmd += " --callgraph=1 "; + full_cmd += event_spec; + fp = popen(full_cmd.c_str(), "r"); + if (fp == NULL) { +@@ -836,14 +807,21 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + pclose(fp); + cerr << "Error retrieving info for event " + << event_spec << endl; ++ if (do_callgraph) ++ cerr << "Note: When doing callgraph profiling, the sample count must be" ++ << endl << "15 times the minimum count value for the event." << endl; + exit(EXIT_FAILURE); + } + pclose(fp); + char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); + operf_event_t event; + strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN - 1); ++ if (do_profiling) ++ event.count = atoi(strtok(NULL, ":")); ++ else ++ event.count = 0UL; + /* Event name is required in the event spec in order for +- * 'ophelp --check-events --ignore-count' to pass. But since unit mask ++ * 'ophelp --check-events' to pass. But since unit mask + * and domain control bits are optional, we need to ensure the result of + * strtok is valid. 
+ */ +@@ -854,7 +832,6 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + int place = _OP_UM; + char * endptr = NULL; + event.evt_um = 0UL; +- event.count = 0UL; + event.no_kernel = 0; + event.no_user = 0; + event.throttled = false; +@@ -904,7 +881,7 @@ void op_pe_utils::op_process_events_list(vector & passed_evts) + #endif + } + +-void op_pe_utils::op_get_default_event(void) ++void op_pe_utils::op_get_default_event(bool do_callgraph) + { + operf_event_t dft_evt; + struct op_default_event_descr descr; +@@ -918,7 +895,18 @@ void op_pe_utils::op_get_default_event(void) + } + + memset(&dft_evt, 0, sizeof(dft_evt)); +- dft_evt.count = descr.count; ++ if (do_callgraph) { ++ struct op_event * _event; ++ op_events(cpu_type); ++ if ((_event = find_event_by_name(descr.name, 0, 0))) { ++ dft_evt.count = _event->min_count * CALLGRAPH_MIN_COUNT_SCALE; ++ } else { ++ cerr << "Error getting event info for " << descr.name << endl; ++ exit(EXIT_FAILURE); ++ } ++ } else { ++ dft_evt.count = descr.count; ++ } + dft_evt.evt_um = descr.um; + strncpy(dft_evt.name, descr.name, OP_MAX_EVT_NAME_LEN - 1); + _get_event_code(&dft_evt, cpu_type); +diff --git a/libpe_utils/op_pe_utils.h b/libpe_utils/op_pe_utils.h +index 400eed3..08b6fae 100644 +--- a/libpe_utils/op_pe_utils.h ++++ b/libpe_utils/op_pe_utils.h +@@ -18,11 +18,13 @@ + #include + + #include ++#include + + #include "op_cpu_type.h" + + #define OP_APPNAME_LEN 1024 + #define OP_MAX_EVENTS 24 ++#define CALLGRAPH_MIN_COUNT_SCALE 15 + + /* A macro to be used for ppc64 architecture-specific code. 
The '__powerpc__' macro + * is defined for both ppc64 and ppc32 architectures, so we must further qualify by +@@ -38,8 +40,10 @@ extern int op_check_perf_events_cap(bool use_cpu_minus_one); + extern int op_get_sys_value(const char * filename); + extern int op_get_cpu_for_perf_events_cap(void); + extern int op_validate_app_name(char ** app, char ** save_appname); +-extern void op_get_default_event(void); +-extern void op_process_events_list(std::vector & passed_evts); ++extern void op_get_default_event(bool do_callgraph); ++extern void op_process_events_list(std::vector & passed_evts, ++ bool do_profiling, bool do_callgraph); ++extern int op_get_next_online_cpu(DIR * dir, struct dirent *entry); + extern std::set op_get_available_cpus(int max_num_cpus); + } + +diff --git a/libperf_events/Makefile.am b/libperf_events/Makefile.am +index 7163610..cf5f434 100644 +--- a/libperf_events/Makefile.am ++++ b/libperf_events/Makefile.am +@@ -7,6 +7,7 @@ AM_CPPFLAGS = \ + -I ${top_srcdir}/libop \ + -I ${top_srcdir}/libdb \ + -I ${top_srcdir}/libperf_events \ ++ -I ${top_srcdir}/libpe_utils \ + @PERF_EVENT_FLAGS@ \ + @OP_CPPFLAGS@ + +diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp +index b4cceaa..319e859 100644 +--- a/libperf_events/operf_counter.cpp ++++ b/libperf_events/operf_counter.cpp +@@ -31,6 +31,7 @@ + #include "operf_process_info.h" + #include "op_libiberty.h" + #include "operf_stats.h" ++#include "op_pe_utils.h" + + + using namespace std; +@@ -645,7 +646,7 @@ void operf_record::setup() + } else if (all_cpus_avail) { + real_cpu = cpu; + } else { +- real_cpu = op_get_next_online_cpu(dir, entry); ++ real_cpu = op_pe_utils::op_get_next_online_cpu(dir, entry); + if (real_cpu < 0) { + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; +diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index 30e64d8..faed9a6 100644 +--- a/libperf_events/operf_utils.cpp ++++ 
b/libperf_events/operf_utils.cpp +@@ -65,161 +65,6 @@ static list unresolved_events; + static struct operf_transient trans; + static bool sfile_init_done; + +-/* Some architectures (e.g., ppc64) do not use the same event value (code) for oprofile +- * and for perf_events. The operf-record process requires event values that perf_events +- * understands, but the operf-read process requires oprofile event values. The purpose of +- * the following method is to map the operf-record event value to a value that +- * opreport can understand. +- */ +-#if PPC64_ARCH +-extern op_cpu cpu_type; +-#define NIL_CODE ~0U +- +-#if HAVE_LIBPFM3 +-static bool _get_codes_for_match(unsigned int pfm_idx, const char name[], +- vector * evt_vec) +-{ +- unsigned int num_events = evt_vec->size(); +- int tmp_code, ret; +- char evt_name[OP_MAX_EVT_NAME_LEN]; +- unsigned int events_converted = 0; +- for (unsigned int i = 0; i < num_events; i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (event.evt_code != NIL_CODE) { +- events_converted++; +- continue; +- } +- memset(evt_name, 0, OP_MAX_EVT_NAME_LEN); +- if (!strcmp(event.name, "CYCLES")) { +- strcpy(evt_name ,"PM_CYC") ; +- } else if (strstr(event.name, "_GRP")) { +- string str = event.name; +- strncpy(evt_name, event.name, str.rfind("_GRP")); +- } else { +- strncpy(evt_name, event.name, strlen(event.name)); +- } +- if (strncmp(name, evt_name, OP_MAX_EVT_NAME_LEN)) +- continue; +- ret = pfm_get_event_code(pfm_idx, &tmp_code); +- if (ret != PFMLIB_SUCCESS) { +- string evt_name_str = event.name; +- string msg = "libpfm cannot find event code for " + evt_name_str + +- "; cannot continue"; +- throw runtime_error(msg); +- } +- event.evt_code = tmp_code; +- (*evt_vec)[i] = event; +- events_converted++; +- cverb << vrecord << "Successfully converted " << event.name << " to perf_event code " +- << hex << tmp_code << endl; +- } +- return (events_converted == num_events); +-} +-#else +-static bool _op_get_event_codes(vector * evt_vec) +-{ +- int 
ret, i; +- unsigned int num_events = evt_vec->size(); +- char evt_name[OP_MAX_EVT_NAME_LEN]; +- unsigned int events_converted = 0; +- uint64_t code[1]; +- +- typedef struct { +- uint64_t *codes; +- char **fstr; +- size_t size; +- int count; +- int idx; +- } pfm_raw_pmu_encode_t; +- +- pfm_raw_pmu_encode_t raw; +- raw.codes = code; +- raw.count = 1; +- raw.fstr = NULL; +- +- if (pfm_initialize() != PFM_SUCCESS) +- throw runtime_error("Unable to initialize libpfm; cannot continue"); +- +- for (unsigned int i = 0; i < num_events; i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (event.evt_code != NIL_CODE) { +- events_converted++; +- continue; +- } +- memset(evt_name, 0, OP_MAX_EVT_NAME_LEN); +- if (!strcmp(event.name, "CYCLES")) { +- strcpy(evt_name ,"PM_CYC") ; +- } else if (strstr(event.name, "_GRP")) { +- string str = event.name; +- strncpy(evt_name, event.name, str.rfind("_GRP")); +- } else { +- strncpy(evt_name, event.name, strlen(event.name)); +- } +- +- memset(&raw, 0, sizeof(raw)); +- ret = pfm_get_os_event_encoding(evt_name, PFM_PLM3, PFM_OS_NONE, &raw); +- if (ret != PFM_SUCCESS) { +- string evt_name_str = event.name; +- string msg = "libpfm cannot find event code for " + evt_name_str + +- "; cannot continue"; +- throw runtime_error(msg); +- } +- +- event.evt_code = raw.codes[0]; +- (*evt_vec)[i] = event; +- events_converted++; +- cverb << vrecord << "Successfully converted " << event.name << " to perf_event code " +- << hex << event.evt_code << endl; +- } +- return (events_converted == num_events); +-} +-#endif +- +-bool OP_perf_utils::op_convert_event_vals(vector * evt_vec) +-{ +- unsigned int i, count; +- char name[256]; +- int ret; +- for (unsigned int i = 0; i < evt_vec->size(); i++) { +- operf_event_t event = (*evt_vec)[i]; +- if (cpu_type == CPU_PPC64_POWER7) { +- if (!strncmp(event.name, "PM_RUN_CYC", strlen("PM_RUN_CYC"))) { +- event.evt_code = 0x600f4; +- } else if (!strncmp(event.name, "PM_RUN_INST_CMPL", strlen("PM_RUN_INST_CMPL"))) { +- 
event.evt_code = 0x500fa; +- } else { +- event.evt_code = NIL_CODE; +- } +- } else { +- event.evt_code = NIL_CODE; +- } +- (*evt_vec)[i] = event; +- } +- +-#if HAVE_LIBPFM3 +- if (pfm_initialize() != PFMLIB_SUCCESS) +- throw runtime_error("Unable to initialize libpfm; cannot continue"); +- +- ret = pfm_get_num_events(&count); +- if (ret != PFMLIB_SUCCESS) +- throw runtime_error("Unable to use libpfm to obtain event code; cannot continue"); +- for(i =0 ; i < count; i++) +- { +- ret = pfm_get_event_name(i, name, 256); +- if (ret != PFMLIB_SUCCESS) +- continue; +- if (_get_codes_for_match(i, name, evt_vec)) +- break; +- } +- return (i != count); +-#else +- return _op_get_event_codes(evt_vec); +-#endif +-} +- +-#endif // PPC64_ARCH +- +- + static inline void update_trans_last(struct operf_transient * trans) + { + trans->last = trans->current; +@@ -1465,38 +1310,3 @@ void OP_perf_utils::op_get_kernel_event_data(struct mmap_data *md, operf_record + md->prev = old; + pc->data_tail = old; + } +- +- +-int OP_perf_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry) +-{ +-#define OFFLINE 0x30 +- unsigned int cpu_num; +- char cpu_online_pathname[40]; +- int res; +- FILE * online; +- again: +- do { +- entry = readdir(dir); +- if (!entry) +- return -1; +- } while (entry->d_type != DT_DIR); +- +- res = sscanf(entry->d_name, "cpu%u", &cpu_num); +- if (res <= 0) +- goto again; +- +- errno = 0; +- snprintf(cpu_online_pathname, 40, "/sys/devices/system/cpu/cpu%u/online", cpu_num); +- if ((online = fopen(cpu_online_pathname, "r")) == NULL) { +- cerr << "Unable to open " << cpu_online_pathname << endl; +- if (errno) +- cerr << strerror(errno) << endl; +- return -1; +- } +- res = fgetc(online); +- fclose(online); +- if (res == OFFLINE) +- goto again; +- else +- return cpu_num; +-} +diff --git a/libperf_events/operf_utils.h b/libperf_events/operf_utils.h +index 4c191fe..2a979e3 100644 +--- a/libperf_events/operf_utils.h ++++ b/libperf_events/operf_utils.h +@@ -87,8 +87,6 @@ 
int op_write_output(int output, void *buf, size_t size); + int op_write_event(event_t * event, u64 sample_type); + int op_read_from_stream(std::ifstream & is, char * buf, std::streamsize sz); + int op_mmap_trace_file(struct mmap_info & info, bool init); +-int op_get_next_online_cpu(DIR * dir, struct dirent *entry); +-bool op_convert_event_vals(std::vector * evt_vec); + void op_reprocess_unresolved_events(u64 sample_type, bool print_progress); + void op_release_resources(void); + } +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 5a85c3f..db847ea 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -720,9 +720,9 @@ static void process_args(int argc, char * const argv[]) + + if (ocount_options::evts.empty()) { + // Use default event +- op_pe_utils::op_get_default_event(); ++ op_pe_utils::op_get_default_event(false); + } else { +- op_pe_utils::op_process_events_list(ocount_options::evts); ++ op_pe_utils::op_process_events_list(ocount_options::evts, false, false); + } + cverb << vdebug << "Number of events passed is " << events.size() << endl; + return; +diff --git a/pe_profiling/Makefile.am b/pe_profiling/Makefile.am +index b27cbc7..8c232c4 100644 +--- a/pe_profiling/Makefile.am ++++ b/pe_profiling/Makefile.am +@@ -6,6 +6,7 @@ AM_CPPFLAGS = \ + -I ${top_srcdir}/libop \ + -I ${top_srcdir}/libutil++ \ + -I ${top_srcdir}/libperf_events \ ++ -I ${top_srcdir}/libpe_utils \ + @PERF_EVENT_FLAGS@ \ + @OP_CPPFLAGS@ + +@@ -15,7 +16,8 @@ AM_CXXFLAGS = @OP_CXXFLAGS@ + AM_LDFLAGS = @OP_LDFLAGS@ + + bin_PROGRAMS = operf +-operf_LDADD = ../libperf_events/libperf_events.a \ ++operf_LDADD = ../libperf_events/libperf_events.a \ ++ ../libpe_utils/libpe_utils.a \ + ../libutil++/libutil++.a \ + ../libdb/libodb.a \ + ../libop/libop.a \ +diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp +index 3fec123..89e9c4b 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -35,6 +35,7 @@ + #include + #include + #include "operf_utils.h" 
++#include "op_pe_utils.h" + #include "op_libiberty.h" + #include "string_manip.h" + #include "cverb.h" +@@ -50,6 +51,7 @@ + #include "op_netburst.h" + + using namespace std; ++using namespace op_pe_utils; + + typedef enum END_CODE { + ALL_OK = 0, +@@ -73,11 +75,11 @@ uid_t my_uid; + bool no_vmlinux; + int kptr_restrict; + char * start_time_human_readable; ++std::vector events; ++ + + #define DEFAULT_OPERF_OUTFILE "operf.data" +-#define CALLGRAPH_MIN_COUNT_SCALE 15 + +-static char full_pathname[PATH_MAX]; + static char * app_name_SAVE = NULL; + static char ** app_args = NULL; + static pid_t jitconv_pid = -1; +@@ -88,7 +90,6 @@ static string samples_dir; + static bool startApp; + static string outputfile; + static char start_time_str[32]; +-static vector events; + static bool jit_conversion_running; + static void convert_sample_data(void); + static int sample_data_pipe[2]; +@@ -948,517 +949,6 @@ out: + } + + +-static int find_app_file_in_dir(const struct dirent * d) +-{ +- if (!strcmp(d->d_name, app_name)) +- return 1; +- else +- return 0; +-} +- +-static int get_PATH_based_pathname(char * path_holder, size_t n) +-{ +- int retval = -1; +- +- char * real_path = getenv("PATH"); +- char * path = (char *) xstrdup(real_path); +- char * segment = strtok(path, ":"); +- while (segment) { +- struct dirent ** namelist; +- int rc = scandir(segment, &namelist, find_app_file_in_dir, NULL); +- if (rc < 0) { +- if (errno != ENOENT) { +- cerr << strerror(errno) << endl; +- cerr << app_name << " cannot be found in your PATH." 
<< endl; +- break; +- } +- } else if (rc == 1) { +- size_t applen = strlen(app_name); +- size_t dirlen = strlen(segment); +- +- if (applen + dirlen + 2 > n) { +- cerr << "Path segment " << segment +- << " prepended to the passed app name is too long" +- << endl; +- retval = -1; +- break; +- } +- +- if (!strcmp(segment, ".")) { +- if (getcwd(path_holder, PATH_MAX) == NULL) { +- retval = -1; +- cerr << "getcwd [3] failed when processing /" << app_name << " found via PATH. Aborting." +- << endl; +- break; +- } +- } else { +- strncpy(path_holder, segment, dirlen); +- } +- strcat(path_holder, "/"); +- strncat(path_holder, app_name, applen); +- retval = 0; +- free(namelist[0]); +- free(namelist); +- +- break; +- } +- segment = strtok(NULL, ":"); +- } +- free(path); +- return retval; +-} +-int validate_app_name(void) +-{ +- int rc = 0; +- struct stat filestat; +- size_t len = strlen(app_name); +- +- if (len > (size_t) (OP_APPNAME_LEN - 1)) { +- cerr << "app name longer than max allowed (" << OP_APPNAME_LEN +- << " chars)\n"; +- cerr << app_name << endl; +- rc = -1; +- goto out; +- } +- +- if (index(app_name, '/') == app_name) { +- // Full pathname of app was specified, starting with "/". +- strncpy(full_pathname, app_name, len); +- } else if ((app_name[0] == '.') && (app_name[1] == '/')) { +- // Passed app is in current directory; e.g., "./myApp" +- if (getcwd(full_pathname, PATH_MAX) == NULL) { +- rc = -1; +- cerr << "getcwd [1] failed when trying to find app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- strcat(full_pathname, "/"); +- if ((strlen(full_pathname) + strlen(app_name + 2) + 1) > PATH_MAX) { +- rc = -1; +- cerr << "Length of current dir (" << full_pathname << ") and app name (" +- << (app_name + 2) << ") exceeds max allowed (" << PATH_MAX << "). Aborting." 
+- << endl; +- goto out; +- } +- strcat(full_pathname, (app_name + 2)); +- } else if (index(app_name, '/')) { +- // Passed app is in a subdirectory of cur dir; e.g., "test-stuff/myApp" +- if (getcwd(full_pathname, PATH_MAX) == NULL) { +- rc = -1; +- cerr << "getcwd [2] failed when trying to find app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- strcat(full_pathname, "/"); +- strcat(full_pathname, app_name); +- } else { +- // Passed app name, at this point, MUST be found in PATH +- rc = get_PATH_based_pathname(full_pathname, PATH_MAX); +- } +- +- if (rc) { +- cerr << "Problem finding app name " << app_name << ". Aborting." +- << endl; +- goto out; +- } +- app_name_SAVE = app_name; +- app_name = full_pathname; +- if (stat(app_name, &filestat)) { +- char msg[OP_APPNAME_LEN + 50]; +- snprintf(msg, OP_APPNAME_LEN + 50, "Non-existent app name \"%s\"", +- app_name); +- perror(msg); +- rc = -1; +- } +- +- out: return rc; +-} +- +-static void _get_event_code(operf_event_t * event) +-{ +- FILE * fp; +- char oprof_event_code[9]; +- string command; +- u64 base_code, config; +- char buf[20]; +- if ((snprintf(buf, 20, "%lu", event->count)) < 0) { +- cerr << "Error parsing event count of " << event->count << endl; +- exit(EXIT_FAILURE); +- } +- +- base_code = config = 0ULL; +- +- command = OP_BINDIR; +- command += "ophelp "; +- command += event->name; +- +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(oprof_event_code, sizeof(oprof_event_code), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- +- pclose(fp); +- +- base_code = strtoull(oprof_event_code, (char **) NULL, 10); +- +- +-#if defined(__i386__) || defined(__x86_64__) +- // Setup EventSelct[11:8] field for AMD +- char mask[12]; +- const char * vendor_AMD = "AuthenticAMD"; +- if 
(op_is_cpu_vendor((char *)vendor_AMD)) { +- config = base_code & 0xF00ULL; +- config = config << 32; +- } +- +- // Setup EventSelct[7:0] field +- config |= base_code & 0xFFULL; +- +- // Setup unitmask field +-handle_named_um: +- if (event->um_name[0]) { +- command = OP_BINDIR; +- command += "ophelp "; +- command += "--extra-mask "; +- command += event->name; +- command += ":"; +- command += buf; +- command += ":"; +- command += event->um_name; +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(mask, sizeof(mask), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find unit mask info for " << event->um_name << " for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- // FIXME: The mask value here is the extra bits from the named unit mask. It's not +- // ideal to put that value into the UM's mask, since that's what will show up in +- // opreport. It would be better if we could somehow have the unit mask name that the +- // user passed to us show up in opreort. +- event->evt_um = strtoull(mask, (char **) NULL, 10); +- /* A value >= EXTRA_MIN_VAL returned by 'ophelp --extra-mask' is interpreted as a +- * valid extra value; otherwise we interpret it as a simple unit mask value +- * for a named unit mask with EXTRA_NONE. 
+- */ +- if (event->evt_um >= EXTRA_MIN_VAL) +- config |= event->evt_um; +- else +- config |= ((event->evt_um & 0xFFULL) << 8); +- } else if (!event->evt_um) { +- char * endptr; +- command.clear(); +- command = OP_BINDIR; +- command += "ophelp "; +- command += "--unit-mask "; +- command += event->name; +- command += ":"; +- command += buf; +- fp = popen(command.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get unit mask for event " +- << event->name << endl; +- exit(EXIT_FAILURE); +- } +- if (fgets(mask, sizeof(mask), fp) == NULL) { +- pclose(fp); +- cerr << "Unable to find unit mask info for event " << event->name << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- event->evt_um = strtoull(mask, &endptr, 10); +- if ((endptr >= mask) && +- (endptr <= (mask + strlen(mask) - 1))) { +- // Must be a default named unit mask +- strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); +- goto handle_named_um; +- } +- config |= ((event->evt_um & 0xFFULL) << 8); +- } else { +- config |= ((event->evt_um & 0xFFULL) << 8); +- } +-#else +- config = base_code; +-#endif +- +- event->op_evt_code = base_code; +- if (cpu_type == CPU_P4 || cpu_type == CPU_P4_HT2) { +- if (op_netburst_get_perf_encoding(event->name, event->evt_um, 1, 1, &config)) { +- cerr << "Unable to get event encoding for " << event->name << endl; +- exit(EXIT_FAILURE); +- } +- } +- event->evt_code = config; +-} +- +-#if PPC64_ARCH +-/* All ppc64 events (except CYCLES) have a _GRP suffix. This is +- * because the legacy opcontrol profiler can only profile events in +- * the same group (i.e., having the same _GRP suffix). But operf +- * can multiplex events, so we should allow the user to pass event +- * names without the _GRP suffix. +- * +- * If event name is not CYCLES or does not have a _GRP suffix, +- * we'll call ophelp and scan the list of events, searching for one +- * that matches up to the _GRP suffix. 
If we don't find a match, +- * then we'll exit with the expected error message for invalid event name. +- */ +-static string _handle_powerpc_event_spec(string event_spec) +-{ +- FILE * fp; +- char line[MAX_INPUT]; +- size_t grp_pos; +- string evt, retval, err_msg; +- size_t evt_name_len; +- bool first_non_cyc_evt_found = false; +- bool event_found = false; +- char event_name[OP_MAX_EVT_NAME_LEN], event_spec_str[OP_MAX_EVT_NAME_LEN + 20], * count_str; +- string cmd = OP_BINDIR; +- cmd += "/ophelp"; +- +- strncpy(event_spec_str, event_spec.c_str(), event_spec.length() + 1); +- +- strncpy(event_name, strtok(event_spec_str, ":"), OP_MAX_EVT_NAME_LEN); +- count_str = strtok(NULL, ":"); +- if (!count_str) { +- err_msg = "Invalid count for event "; +- goto out; +- } +- +- if (!strcmp("CYCLES", event_name)) { +- event_found = true; +- goto out; +- } +- +- evt = event_name; +- // Need to make sure the event name truly has a _GRP suffix. +- grp_pos = evt.rfind("_GRP"); +- if ((grp_pos != string::npos) && ((evt = evt.substr(grp_pos, string::npos))).length() > 4) { +- char * end; +- strtoul(evt.substr(4, string::npos).c_str(), &end, 0); +- if (end && (*end == '\0')) { +- // Valid group number found after _GRP, so we can skip to the end. +- event_found = true; +- goto out; +- } +- } +- +- // If we get here, it implies the user passed a non-CYCLES event without a GRP suffix. +- // Lets try to find a valid suffix for it. +- fp = popen(cmd.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- evt_name_len = strlen(event_name); +- err_msg = "Cannot find event "; +- while (fgets(line, MAX_INPUT, fp)) { +- if (!first_non_cyc_evt_found) { +- if (!strncmp(line, "PM_", 3)) +- first_non_cyc_evt_found = true; +- else +- continue; +- } +- if (line[0] == ' ' || line[0] == '\t') +- continue; +- if (!strncmp(line, event_name, evt_name_len)) { +- // Found a potential match. 
Check if it's a perfect match. +- string save_event_name = event_name; +- size_t full_evt_len = index(line, ':') - line; +- memset(event_name, '\0', OP_MAX_EVT_NAME_LEN); +- strncpy(event_name, line, full_evt_len); +- string candidate = event_name; +- if (candidate.rfind("_GRP") == evt_name_len) { +- event_found = true; +- break; +- } else { +- memset(event_name, '\0', OP_MAX_EVT_NAME_LEN); +- strncpy(event_name, save_event_name.c_str(), evt_name_len); +- } +- } +- } +- pclose(fp); +- +-out: +- if (!event_found) { +- cerr << err_msg << event_name << endl; +- cerr << "Error retrieving info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- retval = event_name; +- return retval + ":" + count_str; +-} +-#endif +- +-static void _process_events_list(void) +-{ +- string cmd = OP_BINDIR; +- if (operf_options::evts.size() > OP_MAX_EVENTS) { +- cerr << "Number of events specified is greater than allowed maximum of " +- << OP_MAX_EVENTS << "." << endl; +- exit(EXIT_FAILURE); +- } +- cmd += "/ophelp --check-events "; +- for (unsigned int i = 0; i < operf_options::evts.size(); i++) { +- FILE * fp; +- string full_cmd = cmd; +- string event_spec = operf_options::evts[i]; +- +-#if PPC64_ARCH +- // Starting with CPU_PPC64_ARCH_V1, ppc64 events files are formatted like +- // other architectures, so no special handling is needed. 
+- if (cpu_type < CPU_PPC64_ARCH_V1) +- event_spec = _handle_powerpc_event_spec(event_spec); +-#endif +- +- if (operf_options::callgraph) { +- full_cmd += " --callgraph=1 "; +- } +- full_cmd += event_spec; +- fp = popen(full_cmd.c_str(), "r"); +- if (fp == NULL) { +- cerr << "Unable to execute ophelp to get info for event " +- << event_spec << endl; +- exit(EXIT_FAILURE); +- } +- if (fgetc(fp) == EOF) { +- pclose(fp); +- cerr << "Error retrieving info for event " +- << event_spec << endl; +- if (operf_options::callgraph) +- cerr << "Note: When doing callgraph profiling, the sample count must be" +- << endl << "15 times the minimum count value for the event." << endl; +- exit(EXIT_FAILURE); +- } +- pclose(fp); +- char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); +- operf_event_t event; +- strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN - 1); +- event.count = atoi(strtok(NULL, ":")); +- /* Name and count are required in the event spec in order for +- * 'ophelp --check-events' to pass. But since unit mask and domain +- * control bits are optional, we need to ensure the result of strtok +- * is valid. +- */ +- char * info; +-#define _OP_UM 1 +-#define _OP_KERNEL 2 +-#define _OP_USER 3 +- int place = _OP_UM; +- char * endptr = NULL; +- event.evt_um = 0ULL; +- event.no_kernel = 0; +- event.no_user = 0; +- event.throttled = false; +- memset(event.um_name, '\0', OP_MAX_UM_NAME_LEN); +- while ((info = strtok(NULL, ":"))) { +- switch (place) { +- case _OP_UM: +- event.evt_um = strtoul(info, &endptr, 0); +- // If any of the UM part is not a number, then we +- // consider the entire part a string. 
+- if (*endptr) { +- event.evt_um = 0; +- strncpy(event.um_name, info, OP_MAX_UM_NAME_LEN - 1); +- } +- break; +- case _OP_KERNEL: +- if (atoi(info) == 0) +- event.no_kernel = 1; +- break; +- case _OP_USER: +- if (atoi(info) == 0) +- event.no_user = 1; +- break; +- } +- place++; +- } +- free(event_str); +- _get_event_code(&event); +- events.push_back(event); +- } +-#if PPC64_ARCH +- { +- /* For ppc64 architecture processors prior to the introduction of +- * architected_events_v1, the oprofile event code needs to be converted +- * to the appropriate event code to pass to the perf_event_open syscall. +- * But as of the introduction of architected_events_v1, the events +- * file contains the necessary event code information, so this conversion +- * step is no longer needed. +- */ +- +- using namespace OP_perf_utils; +- if ((cpu_type < CPU_PPC64_ARCH_V1) && !op_convert_event_vals(&events)) { +- cerr << "Unable to convert all oprofile event values to perf_event values" << endl; +- exit(EXIT_FAILURE); +- } +- } +-#endif +-} +- +-static void get_default_event(void) +-{ +- operf_event_t dft_evt; +- struct op_default_event_descr descr; +- vector tmp_events; +- +- +- op_default_event(cpu_type, &descr); +- if (descr.name[0] == '\0') { +- cerr << "Unable to find default event" << endl; +- exit(EXIT_FAILURE); +- } +- +- memset(&dft_evt, 0, sizeof(dft_evt)); +- if (operf_options::callgraph) { +- struct op_event * _event; +- op_events(cpu_type); +- if ((_event = find_event_by_name(descr.name, 0, 0))) { +- dft_evt.count = _event->min_count * CALLGRAPH_MIN_COUNT_SCALE; +- } else { +- cerr << "Error getting event info for " << descr.name << endl; +- exit(EXIT_FAILURE); +- } +- } else { +- dft_evt.count = descr.count; +- } +- dft_evt.evt_um = descr.um; +- strncpy(dft_evt.name, descr.name, OP_MAX_EVT_NAME_LEN - 1); +- _get_event_code(&dft_evt); +- events.push_back(dft_evt); +- +-#if PPC64_ARCH +- { +- /* This section of code is for architectures such as ppc[64] for which +- * the 
oprofile event code needs to be converted to the appropriate event +- * code to pass to the perf_event_open syscall. +- */ +- +- using namespace OP_perf_utils; +- if ((cpu_type < CPU_PPC64_ARCH_V1) && !op_convert_event_vals(&events)) { +- cerr << "Unable to convert all oprofile event values to perf_event values" << endl; +- exit(EXIT_FAILURE); +- } +- } +-#endif +-} +- + static void _process_session_dir(void) + { + if (operf_options::session_dir.empty()) { +@@ -1752,7 +1242,7 @@ static void process_args(int argc, char * const argv[]) + app_args = (char **) xmalloc((sizeof *app_args) * 2); + app_args[1] = NULL; + } +- if (validate_app_name() < 0) { ++ if (op_validate_app_name(&app_name, &app_name_SAVE) < 0) { + __print_usage_and_exit(NULL); + } + } else { // non_options_idx == 0 +@@ -1783,9 +1273,9 @@ static void process_args(int argc, char * const argv[]) + + if (operf_options::evts.empty()) { + // Use default event +- get_default_event(); ++ op_get_default_event(operf_options::callgraph); + } else { +- _process_events_list(); ++ op_process_events_list(operf_options::evts, true, operf_options::callgraph); + } + op_nr_events = events.size(); + +@@ -1800,87 +1290,6 @@ static void process_args(int argc, char * const argv[]) + return; + } + +-static int _get_cpu_for_perf_events_cap(void) +-{ +- int retval; +- string err_msg; +- char cpus_online[257]; +- FILE * online_cpus; +- DIR *dir = NULL; +- +- int total_cpus = sysconf(_SC_NPROCESSORS_ONLN); +- if (!total_cpus) { +- err_msg = "Internal Error (1): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- +- online_cpus = fopen("/sys/devices/system/cpu/online", "r"); +- if (!online_cpus) { +- err_msg = "Internal Error (2): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- memset(cpus_online, 0, sizeof(cpus_online)); +- +- if ( fgets(cpus_online, sizeof(cpus_online), online_cpus) == NULL) { +- fclose(online_cpus); +- err_msg = "Internal Error (3): Number of 
online cpus cannot be determined."; +- retval = -1; +- goto error; +- } +- +- if (!cpus_online[0]) { +- fclose(online_cpus); +- err_msg = "Internal Error (4): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- +- } +- if (index(cpus_online, ',') || cpus_online[0] != '0') { +- // A comma in cpus_online implies a gap, which in turn implies that not all +- // CPUs are online. +- if ((dir = opendir("/sys/devices/system/cpu")) == NULL) { +- fclose(online_cpus); +- err_msg = "Internal Error (5): Number of online cpus cannot be determined."; +- retval = -1; +- goto error; +- } else { +- struct dirent *entry = NULL; +- retval = OP_perf_utils::op_get_next_online_cpu(dir, entry); +- closedir(dir); +- } +- } else { +- // All CPUs are available, so we just arbitrarily choose CPU 0. +- retval = 0; +- } +- fclose(online_cpus); +-error: +- return retval; +-} +- +- +-static int _check_perf_events_cap(bool use_cpu_minus_one) +-{ +- /* If perf_events syscall is not implemented, the syscall below will fail +- * with ENOSYS (38). If implemented, but the processor type on which this +- * program is running is not supported by perf_events, the syscall returns +- * ENOENT (2). +- */ +- struct perf_event_attr attr; +- pid_t pid ; +- int cpu_to_try = use_cpu_minus_one ? 
-1 : _get_cpu_for_perf_events_cap(); +- errno = 0; +- memset(&attr, 0, sizeof(attr)); +- attr.size = sizeof(attr); +- attr.sample_type = PERF_SAMPLE_IP; +- +- pid = getpid(); +- syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0); +- return errno; +- +-} +- + static void _precheck_permissions_to_samplesdir(string sampledir, bool for_current) + { + /* Pre-check to make sure we have permission to remove old sample data +@@ -1911,28 +1320,14 @@ static void _precheck_permissions_to_samplesdir(string sampledir, bool for_curre + + } + +-static int _get_sys_value(const char * filename) +-{ +- char str[10]; +- int _val = -999; +- FILE * fp = fopen(filename, "r"); +- if (fp == NULL) +- return _val; +- if (fgets(str, 9, fp)) +- sscanf(str, "%d", &_val); +- fclose(fp); +- return _val; +-} +- +- + int main(int argc, char * const argv[]) + { + int rc; +- int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid"); ++ int perf_event_paranoid = op_get_sys_value("/proc/sys/kernel/perf_event_paranoid"); + + my_uid = geteuid(); + throttled = false; +- rc = _check_perf_events_cap(use_cpu_minus_one); ++ rc = op_check_perf_events_cap(use_cpu_minus_one); + if (rc == EACCES) { + /* Early perf_events kernels required the cpu argument to perf_event_open + * to be '-1' when setting up to profile a single process if 1) the user is +@@ -1948,7 +1343,7 @@ int main(int argc, char * const argv[]) + */ + if (my_uid != 0 && perf_event_paranoid > 0) { + use_cpu_minus_one = true; +- rc = _check_perf_events_cap(use_cpu_minus_one); ++ rc = op_check_perf_events_cap(use_cpu_minus_one); + } + } + if (rc == EBUSY) { +@@ -1996,7 +1391,7 @@ int main(int argc, char * const argv[]) + _precheck_permissions_to_samplesdir(previous_sampledir, for_current); + } + } +- kptr_restrict = _get_sys_value("/proc/sys/kernel/kptr_restrict"); ++ kptr_restrict = op_get_sys_value("/proc/sys/kernel/kptr_restrict"); + end_code_t run_result; + if ((run_result = _run())) { + if (startApp && 
app_started && (run_result != APP_ABNORMAL_END)) { diff --git a/SOURCES/oprofile-silvermont.patch b/SOURCES/oprofile-silvermont.patch new file mode 100644 index 0000000..3ddd04e --- /dev/null +++ b/SOURCES/oprofile-silvermont.patch @@ -0,0 +1,434 @@ +commit 4b1497d8befcc4c8b26dc4e4866c3422ae8787c3 +Author: Andi Kleen +Date: Thu Oct 10 13:12:28 2013 -0500 + + Add support for Intel Silvermont processor + + Just add the event list for Intel Silvermont based systems + (Avoton, BayTrail) and the usual changes for a new CPU. + No new code otherwise. + + The model number list is incomplete at this point, more will + be added in the future. + + I also finally removed the top level event list descriptions. + All the events are only described in the unit masks now + (Intel doesn't really have a top level event, and I had + to invent descriptions, which was error prone and + often wrong) + + I also removed some outdated document number references. + + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index d91d44b..3028c2f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -21,6 +21,7 @@ event_files = \ + i386/sandybridge/events i386/sandybridge/unit_masks \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ ++ i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ + ia64/itanium/events ia64/itanium/unit_masks \ +diff --git a/events/i386/silvermont/events b/events/i386/silvermont/events +new file mode 100644 +index 0000000..077cc0a +--- /dev/null ++++ b/events/i386/silvermont/events +@@ -0,0 +1,26 @@ ++# ++# Intel "Silvermont" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Silvermont based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++include:i386/arch_perfmon ++event:0x32 counters:0,1 um:l2_prefetcher_throttle minimum:200003 name:l2_prefetcher_throttle : ++event:0x3e counters:0,1 um:one minimum:200003 name:l2_prefetcher_pref_stream_alloc : ++event:0x50 counters:0,1 um:zero minimum:200003 name:l2_prefetch_pend_streams_pref_stream_pend_set : ++event:0x86 counters:0,1 um:nip_stall minimum:200003 name:nip_stall : ++event:0x87 counters:0,1 um:decode_stall minimum:200003 name:decode_stall : ++event:0x96 counters:0,1 um:uip_match minimum:200003 name:uip_match : ++event:0xc2 counters:0,1 um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:0,1 um:x10 minimum:200003 name:machine_clears_live_lock_breaker : ++event:0xc4 counters:0,1 um:br_inst_retired minimum:2000003 name:br_inst_retired : ++event:0xc5 counters:0,1 um:br_misp_retired minimum:200003 name:br_misp_retired : ++event:0xca counters:0,1 um:no_alloc_cycles minimum:200003 name:no_alloc_cycles : ++event:0xcb counters:0,1 um:rs_full_stall minimum:200003 name:rs_full_stall : ++event:0xcc counters:0,1 um:rs_dispatch_stall minimum:200003 name:rs_dispatch_stall : ++event:0xe6 counters:0,1 um:baclears minimum:2000003 name:baclears : ++event:0xe7 counters:0,1 um:x02 minimum:200003 name:ms_decoded_early_exit : ++event:0xe8 counters:0,1 um:one minimum:200003 name:btclears_all : ++event:0xe9 counters:0,1 um:decode_restriction minimum:200003 name:decode_restriction : +diff --git a/events/i386/silvermont/unit_masks b/events/i386/silvermont/unit_masks +new file mode 100644 +index 0000000..6309282 +--- /dev/null ++++ b/events/i386/silvermont/unit_masks +@@ -0,0 +1,71 @@ ++# ++# Unit masks for the Intel "Silvermont" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Silvermont based CPUs ++# ++include:i386/arch_perfmon ++name:x02 type:mandatory default:0x2 ++ 0x2 No unit mask ++name:x10 type:mandatory default:0x10 ++ 0x10 No unit mask ++name:l2_prefetcher_throttle type:exclusive default:0x2 ++ 0x2 extra:edge 
conservative Counts the number of cycles the L2 prefetcher spends in throttling mode ++ 0x1 extra:edge aggressive Counts the number of cycles the L2 prefetcher spends in throttling mode ++name:nip_stall type:exclusive default:0x3f ++ 0x3f extra: all Counts the number of cycles the NIP stalls. ++ 0x1 extra: pfb_full Counts the number of cycles the NIP stalls and the PFBs are full. This DOES NOT inlude PFB throttler cases. ++ 0x2 extra: itlb_miss Counts the number of cycles the NIP stalls and there is an outstanding ITLB miss. This is a cummulative count of cycles the NIP stalled for all ITLB misses. ++ 0x8 extra: pfb_throttler Counts the number of cycles the NIP stalls, the throttler is engaged, and the PFBs appear full. ++ 0x10 extra: do_snoop Counts the number of cycles the NIP stalls because of a SMC compliance snoop to the MEC is required. ++ 0x20 extra: misc_other Counts the number of cycles the NIP stalls due to NUKE, Stop Front End, Inserted flows. ++ 0x1e extra: pfb_ready Counts the number of cycles the NIP stalls when the PFBs are not full and the decoders are able to process bytes. Does not count PFB_FULL nor MISC_OTHER stall cycles. ++name:decode_stall type:exclusive default:0x1 ++ 0x1 extra: pfb_empty Counts the number of cycles decoder is stalled because the PFB is empty, this count is useful to see if the decoder is receiving the bytes from the front end. This event together with the DECODE_STALL.IQ_FULL may be used to narrow down on the bottleneck. ++ 0x2 extra: iq_full Counts the number of cycles decoder is stalled because the IQ is full, this count is useful to see if the decoder is delivering the decoded uops. This event together with the DECODE_STALL.PFB_EMPTY may be used to narrow down on the bottleneck. 
++name:uip_match type:exclusive default:0x1 ++ 0x1 extra: first_uip This event is used for counting the number of times a specific micro IP address was decoded ++ 0x2 extra: second_uip This event is used for counting the number of times a specific micro IP address was decoded ++name:uops_retired type:exclusive default:0x2 ++ 0x2 extra: x87 This event counts the number of micro-ops retired that used X87 hardware. ++ 0x4 extra: mul This event counts the number of micro-ops retired that used MUL hardware. ++ 0x8 extra: div This event counts the number of micro-ops retired that used DIV hardware. ++ 0x1 extra: ms_cyles Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to faults, assists, and inserted flows. ++name:br_inst_retired type:exclusive default:0x1 ++ 0x1 extra: remove_jcc REMOVE_JCC counts the number of branch instructions retired but removes taken and not taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x2 extra: remove_rel_call REMOVE_REL_CALL counts the number of branch instructions retired but removes near relative CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x4 extra: remove_ind_call REMOVE_IND_CALL counts the number of branch instructions retired but removes near indirect CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x8 extra: remove_ret REMOVE_RET counts the number of branch instructions retired but removes near RET. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of branch instructions retired but removes near indirect JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x20 extra: remove_rel_jmp REMOVE_REL_JMP counts the number of branch instructions retired but removes near relative JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x40 extra: remove_far REMOVE_FAR counts the number of branch instructions retired but removes all far branches. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of branch instructions retired but removes taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++name:br_misp_retired type:exclusive default:0x1 ++ 0x1 extra: remove_jcc REMOVE_JCC counts the number of mispredicted branch instructions retired but removes taken and not taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x4 extra: remove_ind_call REMOVE_IND_CALL Counts the number of mispredicted branch instructions retired but removes near indirect CALL. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x8 extra: remove_ret REMOVE_RET Counts the number of mispredicted branch instructions retired but removes near RET. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of mispredicted branch instructions retired but removes near indirect JMP. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of mispredicted branch instructions retired but removes taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++name:no_alloc_cycles type:exclusive default:0x3f ++ 0x3f extra:inv all Counts the number of cycles that uops are allocated (inverse of NO_ALLOC_CYCLES.ALL) ++ 0x2 extra: sd_buffer_full Counts the number of cycles when no uops are allocated and the store data buffer is full. ++ 0x4 extra: mispredicts Counts the number of cycles when no uops are allocated and the alloc pipe is stalled waiting for a mispredicted jump to retire. 
After the misprediction is detected, the front end will start immediately but the allocate pipe stalls until the mispredicted ++ 0x8 extra: scoreboard Counts the number of cycles when no uops are allocated and a microcode IQ-based scoreboard stall is active. This includes stalls due to both the retirement scoreboard (at-ret) and micro-Jcc execution scoreboard (at-jeu). Does not count cycles when the MS ++ 0x10 extra: iq_empty Counts the number of cycles when no uops are allocated and the IQ is empty. Will assert immediately after a mispredict and partially overlap with MISPREDICTS sub event. ++name:rs_full_stall type:exclusive default:0x2 ++ 0x2 extra: iec_port0 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 0 is full. ++ 0x4 extra: iec_port1 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 1 is full. ++ 0x8 extra: fpc_port0 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 0 is full. ++ 0x10 extra: fpc_port1 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 1 is full. 
++name:rs_dispatch_stall type:exclusive default:0x1 ++ 0x1 extra: iec0_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 0 of IEC RS while the RS had valid ops left to dispatch ++ 0x2 extra: iec1_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 1 of IEC RS while the RS had valid ops left to dispatch ++ 0x4 extra: fpc0_rs Counts cycles when no uops were disptached from port 0 of FPC RS while the RS had valid ops left to dispatch ++ 0x8 extra: fpc1_rs Counts cycles when no uops were disptached from port 1 of FPC RS while the RS had valid ops left to dispatch ++ 0x10 extra: mec_rs Counts cycles when no uops were dispatched from the MEC RS or rehab queue while valid ops were left to dispatch ++name:baclears type:exclusive default:0x2 ++ 0x2 extra: indirect Counts the number indirect branch baclears ++ 0x4 extra: uncond Counts the number unconditional branch baclears ++ 0x1e extra: no_corner_case sum of submasks [4:1]. Does not count special case baclears due to things like parity errors, bogus branches, and pd$ issues. ++name:decode_restriction type:exclusive default:0x1 ++ 0x1 extra: pdcache_wrong Counts the number of times a decode restriction reduced the decode throughput due to wrong instruction length prediction ++ 0x2 extra: all_3cycle_resteers Counts the number of times a decode restriction reduced the decode throughput because of all 3 cycle resteer conditions. Mainly PDCACHE_WRONG and MS_ENTRY cases. 
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index badb7ba..4bb34b7 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -127,6 +127,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "AMD64 generic", "x86-64/generic", CPU_AMD64_GENERIC, 4 }, + { "IBM Power Architected Events V1", "ppc64/architected_events_v1", CPU_PPC64_ARCH_V1, 6 }, + { "ppc64 POWER8", "ppc64/power8", CPU_PPC64_POWER8, 6 }, ++ { "Intel Silvermont microarchitecture", "i386/silvermont", CPU_SILVERMONT, 2 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -644,6 +645,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_ATOM: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 934fe9e..4703fa9 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -107,6 +107,7 @@ typedef enum { + CPU_AMD64_GENERIC, /**< AMD64 Generic */ + CPU_PPC64_ARCH_V1, /** < IBM Power architected events version 1 */ + CPU_PPC64_POWER8, /**< ppc64 POWER8 family */ ++ CPU_SILVERMONT, /** < Intel Silvermont microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 9d2aa5e..39c710d 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1201,6 +1201,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 6ae19bc..e86dcae 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -150,6 +150,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x46: + case 0x47: + return CPU_HASWELL; ++ case 0x37: ++ case 0x4d: ++ return CPU_SILVERMONT; + } + } + return cpu_type; 
+diff --git a/utils/ophelp.c b/utils/ophelp.c +index 3b2896a..7543c6f 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -551,19 +551,20 @@ int main(int argc, char const * argv[]) + case CPU_CORE_I7: + case CPU_NEHALEM: + case CPU_HASWELL: ++ case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: + case CPU_ATOM: + event_doc = + "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" +- "Intel Architecture Optimization Reference Manual (730795-001)\n\n"; ++ "Intel Architecture Optimization Reference Manual\n\n"; + break; + + case CPU_ARCH_PERFMON: + event_doc = + "See Intel 64 and IA-32 Architectures Software Developer's Manual\n" +- "Volume 3B (Document 253669) Chapter 18 for architectural perfmon events\n" ++ "Volume 3B Chapter 18 for architectural perfmon events\n" + "This is a limited set of fallback events because oprofile doesn't know your CPU\n"; + break; + +commit 88779857662560604f85db608cf90f8609e1da6f +Author: Andi Kleen +Date: Thu Sep 11 09:00:52 2014 -0500 + + Update the Silvermont event files + + On further review the silvermont event files had a lot of problems. + I regenerated them completely. This fixes the PEBS events, and + fixes a range of others. + + The test suite passes without problems. + + Signed-off-by: Andi Kleen + +diff --git a/events/i386/silvermont/events b/events/i386/silvermont/events +index 077cc0a..434538f 100644 +--- a/events/i386/silvermont/events ++++ b/events/i386/silvermont/events +@@ -7,20 +7,18 @@ + # lowered in many cases without ill effect. 
+ # + include:i386/arch_perfmon +-event:0x32 counters:0,1 um:l2_prefetcher_throttle minimum:200003 name:l2_prefetcher_throttle : +-event:0x3e counters:0,1 um:one minimum:200003 name:l2_prefetcher_pref_stream_alloc : +-event:0x50 counters:0,1 um:zero minimum:200003 name:l2_prefetch_pend_streams_pref_stream_pend_set : +-event:0x86 counters:0,1 um:nip_stall minimum:200003 name:nip_stall : +-event:0x87 counters:0,1 um:decode_stall minimum:200003 name:decode_stall : +-event:0x96 counters:0,1 um:uip_match minimum:200003 name:uip_match : ++event:0x03 counters:0,1 um:rehabq minimum:200003 name:rehabq : ++event:0x04 counters:0,1 um:mem_uops_retired minimum:200003 name:mem_uops_retired : ++event:0x05 counters:0,1 um:page_walks minimum:200003 name:page_walks : ++event:0x30 counters:0,1 um:zero minimum:200003 name:l2_reject_xq_all : ++event:0x31 counters:0,1 um:zero minimum:200003 name:core_reject_l2q_all : ++event:0x80 counters:0,1 um:icache minimum:200003 name:icache : + event:0xc2 counters:0,1 um:uops_retired minimum:2000003 name:uops_retired : +-event:0xc3 counters:0,1 um:x10 minimum:200003 name:machine_clears_live_lock_breaker : +-event:0xc4 counters:0,1 um:br_inst_retired minimum:2000003 name:br_inst_retired : ++event:0xc3 counters:0,1 um:machine_clears minimum:200003 name:machine_clears : ++event:0xc4 counters:0,1 um:br_inst_retired minimum:200003 name:br_inst_retired : + event:0xc5 counters:0,1 um:br_misp_retired minimum:200003 name:br_misp_retired : + event:0xca counters:0,1 um:no_alloc_cycles minimum:200003 name:no_alloc_cycles : + event:0xcb counters:0,1 um:rs_full_stall minimum:200003 name:rs_full_stall : +-event:0xcc counters:0,1 um:rs_dispatch_stall minimum:200003 name:rs_dispatch_stall : +-event:0xe6 counters:0,1 um:baclears minimum:2000003 name:baclears : +-event:0xe7 counters:0,1 um:x02 minimum:200003 name:ms_decoded_early_exit : +-event:0xe8 counters:0,1 um:one minimum:200003 name:btclears_all : +-event:0xe9 counters:0,1 um:decode_restriction minimum:200003 
name:decode_restriction : ++event:0xcd counters:0,1 um:one minimum:2000003 name:cycles_div_busy_all : ++event:0xe6 counters:0,1 um:baclears minimum:200003 name:baclears : ++event:0xe7 counters:0,1 um:one minimum:200003 name:ms_decoded_ms_entry : +diff --git a/events/i386/silvermont/unit_masks b/events/i386/silvermont/unit_masks +index 6309282..c0dac26 100644 +--- a/events/i386/silvermont/unit_masks ++++ b/events/i386/silvermont/unit_masks +@@ -4,68 +4,86 @@ + # See http://ark.intel.com/ for help in identifying Silvermont based CPUs + # + include:i386/arch_perfmon +-name:x02 type:mandatory default:0x2 +- 0x2 No unit mask +-name:x10 type:mandatory default:0x10 +- 0x10 No unit mask +-name:l2_prefetcher_throttle type:exclusive default:0x2 +- 0x2 extra:edge conservative Counts the number of cycles the L2 prefetcher spends in throttling mode +- 0x1 extra:edge aggressive Counts the number of cycles the L2 prefetcher spends in throttling mode +-name:nip_stall type:exclusive default:0x3f +- 0x3f extra: all Counts the number of cycles the NIP stalls. +- 0x1 extra: pfb_full Counts the number of cycles the NIP stalls and the PFBs are full. This DOES NOT inlude PFB throttler cases. +- 0x2 extra: itlb_miss Counts the number of cycles the NIP stalls and there is an outstanding ITLB miss. This is a cummulative count of cycles the NIP stalled for all ITLB misses. +- 0x8 extra: pfb_throttler Counts the number of cycles the NIP stalls, the throttler is engaged, and the PFBs appear full. +- 0x10 extra: do_snoop Counts the number of cycles the NIP stalls because of a SMC compliance snoop to the MEC is required. +- 0x20 extra: misc_other Counts the number of cycles the NIP stalls due to NUKE, Stop Front End, Inserted flows. +- 0x1e extra: pfb_ready Counts the number of cycles the NIP stalls when the PFBs are not full and the decoders are able to process bytes. Does not count PFB_FULL nor MISC_OTHER stall cycles. 
+-name:decode_stall type:exclusive default:0x1 +- 0x1 extra: pfb_empty Counts the number of cycles decoder is stalled because the PFB is empty, this count is useful to see if the decoder is receiving the bytes from the front end. This event together with the DECODE_STALL.IQ_FULL may be used to narrow down on the bottleneck. +- 0x2 extra: iq_full Counts the number of cycles decoder is stalled because the IQ is full, this count is useful to see if the decoder is delivering the decoded uops. This event together with the DECODE_STALL.PFB_EMPTY may be used to narrow down on the bottleneck. +-name:uip_match type:exclusive default:0x1 +- 0x1 extra: first_uip This event is used for counting the number of times a specific micro IP address was decoded +- 0x2 extra: second_uip This event is used for counting the number of times a specific micro IP address was decoded +-name:uops_retired type:exclusive default:0x2 +- 0x2 extra: x87 This event counts the number of micro-ops retired that used X87 hardware. +- 0x4 extra: mul This event counts the number of micro-ops retired that used MUL hardware. +- 0x8 extra: div This event counts the number of micro-ops retired that used DIV hardware. +- 0x1 extra: ms_cyles Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to faults, assists, and inserted flows. +-name:br_inst_retired type:exclusive default:0x1 +- 0x1 extra: remove_jcc REMOVE_JCC counts the number of branch instructions retired but removes taken and not taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x2 extra: remove_rel_call REMOVE_REL_CALL counts the number of branch instructions retired but removes near relative CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x4 extra: remove_ind_call REMOVE_IND_CALL counts the number of branch instructions retired but removes near indirect CALL. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x8 extra: remove_ret REMOVE_RET counts the number of branch instructions retired but removes near RET. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of branch instructions retired but removes near indirect JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x20 extra: remove_rel_jmp REMOVE_REL_JMP counts the number of branch instructions retired but removes near relative JMP. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x40 extra: remove_far REMOVE_FAR counts the number of branch instructions retired but removes all far branches. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. 
The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +- 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of branch instructions retired but removes taken conditional branches (JCC). Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. +-name:br_misp_retired type:exclusive default:0x1 +- 0x1 extra: remove_jcc REMOVE_JCC counts the number of mispredicted branch instructions retired but removes taken and not taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x4 extra: remove_ind_call REMOVE_IND_CALL Counts the number of mispredicted branch instructions retired but removes near indirect CALL. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x8 extra: remove_ret REMOVE_RET Counts the number of mispredicted branch instructions retired but removes near RET. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x10 extra: remove_ind_jmp REMOVE_IND_JMP counts the number of mispredicted branch instructions retired but removes near indirect JMP. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. +- 0x80 extra: remove_not_taken_jcc REMOVE_NOT_TAKEN_JCC counts the number of mispredicted branch instructions retired but removes taken conditional branches (JCC). This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
++name:rehabq type:exclusive default:0x1 ++ 0x1 extra: ld_block_st_forward This event counts the number of retired loads that were prohibited from receiving forwarded data from the store because of address mismatch. ++ 0x1 extra:pebs ld_block_st_forward_pebs This event counts the number of retired loads that were prohibited from receiving forwarded data from the store because of address mismatch. ++ 0x2 extra: ld_block_std_notready This event counts the cases where a forward was technically possible, but did not occur because the store data was not available at the right time ++ 0x4 extra: st_splits This event counts the number of retired stores that experienced cache line boundary splits ++ 0x8 extra: ld_splits This event counts the number of retired loads that experienced cache line boundary splits ++ 0x8 extra:pebs ld_splits_pebs This event counts the number of retired loads that experienced cache line boundary splits ++ 0x10 extra: lock This event counts the number of retired memory operations with lock semantics. These are either implicit locked instructions such as the XCHG instruction or instructions with an explicit LOCK prefix (0xF0). ++ 0x20 extra: sta_full This event counts the number of retired stores that are delayed because there is not a store address buffer available. ++ 0x40 extra: any_ld This event counts the number of load uops reissued from Rehabq ++ 0x80 extra: any_st This event counts the number of store uops reissued from Rehabq ++name:mem_uops_retired type:exclusive default:0x1 ++ 0x1 extra: l1_miss_loads This event counts the number of load ops retired that miss in L1 Data cache. Note that prefetch misses will not be counted.
++ 0x2 extra: l2_hit_loads This event counts the number of load ops retired that hit in the L2 ++ 0x2 extra:pebs l2_hit_loads_pebs This event counts the number of load ops retired that hit in the L2 ++ 0x4 extra: l2_miss_loads This event counts the number of load ops retired that miss in the L2 ++ 0x4 extra:pebs l2_miss_loads_pebs This event counts the number of load ops retired that miss in the L2 ++ 0x8 extra: dtlb_miss_loads This event counts the number of load ops retired that had DTLB miss. ++ 0x8 extra:pebs dtlb_miss_loads_pebs This event counts the number of load ops retired that had DTLB miss. ++ 0x10 extra: utlb_miss This event counts the number of load ops retired that had UTLB miss. ++ 0x20 extra: hitm This event counts the number of load ops retired that got data from the other core or from the other module. ++ 0x20 extra:pebs hitm_pebs This event counts the number of load ops retired that got data from the other core or from the other module. ++ 0x40 extra: all_loads This event counts the number of load ops retired ++ 0x80 extra: all_stores This event counts the number of store ops retired ++name:page_walks type:exclusive default:0x1 ++ 0x1 extra:edge d_side_walks This event counts when a data (D) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x1 extra: d_side_cycles This event counts every cycle when a D-side (walks due to a load) page walk is in progress. Page walk duration divided by number of page walks is the average duration of page-walks. ++ 0x2 extra:edge i_side_walks This event counts when an instruction (I) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x2 extra: i_side_cycles This event counts every cycle when a I-side (walks due to an instruction fetch) page walk is in progress. 
Page walk duration divided by number of page walks is the average duration of page-walks. ++ 0x3 extra:edge walks This event counts when a data (D) page walk or an instruction (I) page walk is completed or started. Since a page walk implies a TLB miss, the number of TLB misses can be counted by counting the number of pagewalks. ++ 0x3 extra: cycles This event counts every cycle when a data (D) page walk or instruction (I) page walk is in progress. Since a pagewalk implies a TLB miss, the approximate cost of a TLB miss can be determined from this event. ++name:icache type:exclusive default:0x3 ++ 0x3 extra: accesses This event counts all instruction fetches, including uncacheable fetches. ++ 0x1 extra: hit This event counts all instruction fetches from the instruction cache. ++ 0x2 extra: misses This event counts all instruction fetches that miss the Instruction cache or produce memory requests. This includes uncacheable fetches. An instruction fetch miss is counted only once and not once for every cycle it is outstanding. ++name:uops_retired type:exclusive default:0x10 ++ 0x10 extra: all This event counts the number of micro-ops retired. The processor decodes complex macro instructions into a sequence of simpler micro-ops. Most instructions are composed of one or two micro-ops. Some instructions are decoded into longer sequences such as repeat instructions, floating point transcendental instructions, and assists. In some cases micro-op sequences are fused or whole instructions are fused into one micro-op. See other UOPS_RETIRED events for differentiating retired fused and non-fused micro-ops. ++ 0x1 extra: ms This event counts the number of micro-ops retired that were supplied from MSROM. ++name:machine_clears type:exclusive default:0x8 ++ 0x8 extra: all Machine clears happen when something happens in the machine that causes the hardware to need to take special care to get the right answer. 
When such a condition is signaled on an instruction, the front end of the machine is notified that it must restart, so no more instructions will be decoded from the current path. All instructions "older" than this one will be allowed to finish. This instruction and all "younger" instructions must be cleared, since they must not be allowed to complete. Essentially, the hardware waits until the problematic instruction is the oldest instruction in the machine. This means all older instructions are retired, and all pending stores (from older instructions) are completed. Then the new path of instructions from the front end are allowed to start into the machine. There are many conditions that might cause a machine clear (including the receipt of an interrupt, or a trap or a fault). All those conditions (including but not limited to MACHINE_CLEARS.MEMORY_ORDERING, MACHINE_CLEARS.SMC, and MACHINE_CLEARS.FP_ASSIST) are captured in the ANY event. In addition, some conditions can be specifically counted (i.e. SMC, MEMORY_ORDERING, FP_ASSIST). However, the sum of SMC, MEMORY_ORDERING, and FP_ASSIST machine clears will not necessarily equal the number of ANY. ++ 0x1 extra: smc This event counts the number of times that a program writes to a code section. Self-modifying code causes a severe penalty in all Intel architecture processors. ++ 0x2 extra: memory_ordering This event counts the number of times that pipeline was cleared due to memory ordering issues. ++ 0x4 extra: fp_assist This event counts the number of times that pipeline stalled due to FP operations needing assists. ++name:br_inst_retired type:exclusive default:0x7e ++ 0x7e extra: jcc JCC counts the number of conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction.
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0x7e extra:pebs jcc_pebs JCC counts the number of conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfe extra: taken_jcc TAKEN_JCC counts the number of taken conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfe extra:pebs taken_jcc_pebs TAKEN_JCC counts the number of taken conditional branch (JCC) instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf9 extra: call CALL counts the number of near CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf9 extra:pebs call_pebs CALL counts the number of near CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfd extra: rel_call REL_CALL counts the number of near relative CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfd extra:pebs rel_call_pebs REL_CALL counts the number of near relative CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfb extra: ind_call IND_CALL counts the number of near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xfb extra:pebs ind_call_pebs IND_CALL counts the number of near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf7 extra: return RETURN counts the number of near RET branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xf7 extra:pebs return_pebs RETURN counts the number of near RET branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xeb extra: non_return_ind NON_RETURN_IND counts the number of near indirect JMP and near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xeb extra:pebs non_return_ind_pebs NON_RETURN_IND counts the number of near indirect JMP and near indirect CALL branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xbf extra: far_branch FAR counts the number of far branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++ 0xbf extra:pebs far_branch_pebs FAR counts the number of far branch instructions retired. Branch prediction predicts the branch target and enables the processor to begin executing instructions long before the branch true execution path is known. All branches utilize the branch prediction unit (BPU) for prediction. 
This unit predicts the target address not only based on the EIP of the branch but also based on the execution path through which execution reached this EIP. The BPU can efficiently predict the following branch types: conditional branches, direct calls and jumps, indirect calls and jumps, returns. ++name:br_misp_retired type:exclusive default:0x7e ++ 0x7e extra: jcc JCC counts the number of mispredicted conditional branches (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0x7e extra:pebs jcc_pebs JCC counts the number of mispredicted conditional branches (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfe extra: taken_jcc TAKEN_JCC counts the number of mispredicted taken conditional branch (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
++ 0xfe extra:pebs taken_jcc_pebs TAKEN_JCC counts the number of mispredicted taken conditional branch (JCC) instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfb extra: ind_call IND_CALL counts the number of mispredicted near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xfb extra:pebs ind_call_pebs IND_CALL counts the number of mispredicted near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xf7 extra: return RETURN counts the number of mispredicted near RET branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. 
When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xf7 extra:pebs return_pebs RETURN counts the number of mispredicted near RET branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xeb extra: non_return_ind NON_RETURN_IND counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. ++ 0xeb extra:pebs non_return_ind_pebs NON_RETURN_IND counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired. This event counts the number of retired branch instructions that were mispredicted by the processor, categorized by type. A branch misprediction occurs when the processor predicts that the branch would be taken, but it is not, or vice-versa. When the misprediction is discovered, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. 
+ name:no_alloc_cycles type:exclusive default:0x3f +- 0x3f extra:inv all Counts the number of cycles that uops are allocated (inverse of NO_ALLOC_CYCLES.ALL) +- 0x2 extra: sd_buffer_full Counts the number of cycles when no uops are allocated and the store data buffer is full. +- 0x4 extra: mispredicts Counts the number of cycles when no uops are allocated and the alloc pipe is stalled waiting for a mispredicted jump to retire. After the misprediction is detected, the front end will start immediately but the allocate pipe stalls until the mispredicted +- 0x8 extra: scoreboard Counts the number of cycles when no uops are allocated and a microcode IQ-based scoreboard stall is active. This includes stalls due to both the retirement scoreboard (at-ret) and micro-Jcc execution scoreboard (at-jeu). Does not count cycles when the MS +- 0x10 extra: iq_empty Counts the number of cycles when no uops are allocated and the IQ is empty. Will assert immediately after a mispredict and partially overlap with MISPREDICTS sub event. +-name:rs_full_stall type:exclusive default:0x2 +- 0x2 extra: iec_port0 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 0 is full. +- 0x4 extra: iec_port1 Counts the number of cycles the Alloc pipeline is stalled because IEC RS for port 1 is full. +- 0x8 extra: fpc_port0 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 0 is full. +- 0x10 extra: fpc_port1 Counts the number of cycles the Alloc pipeline is stalled because FPC RS for port 1 is full. 
+-name:rs_dispatch_stall type:exclusive default:0x1 +- 0x1 extra: iec0_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 0 of IEC RS while the RS had valid ops left to dispatch +- 0x2 extra: iec1_rs *COUNTER BROKEN - NO FIX* Counts cycles when no uops were disptached from port 1 of IEC RS while the RS had valid ops left to dispatch +- 0x4 extra: fpc0_rs Counts cycles when no uops were disptached from port 0 of FPC RS while the RS had valid ops left to dispatch +- 0x8 extra: fpc1_rs Counts cycles when no uops were disptached from port 1 of FPC RS while the RS had valid ops left to dispatch +- 0x10 extra: mec_rs Counts cycles when no uops were dispatched from the MEC RS or rehab queue while valid ops were left to dispatch +-name:baclears type:exclusive default:0x2 +- 0x2 extra: indirect Counts the number indirect branch baclears +- 0x4 extra: uncond Counts the number unconditional branch baclears +- 0x1e extra: no_corner_case sum of submasks [4:1]. Does not count special case baclears due to things like parity errors, bogus branches, and pd$ issues. +-name:decode_restriction type:exclusive default:0x1 +- 0x1 extra: pdcache_wrong Counts the number of times a decode restriction reduced the decode throughput due to wrong instruction length prediction +- 0x2 extra: all_3cycle_resteers Counts the number of times a decode restriction reduced the decode throughput because of all 3 cycle resteer conditions. Mainly PDCACHE_WRONG and MS_ENTRY cases. ++ 0x3f extra: all The NO_ALLOC_CYCLES.ALL event counts the number of cycles when the front-end does not provide any instructions to be allocated for any reason. This event indicates the cycles where an allocation stalls occurs, and no UOPS are allocated in that cycle. 
++ 0x1 extra: rob_full Counts the number of cycles when no uops are allocated and the ROB is full (less than 2 entries available) ++ 0x20 extra: rat_stall Counts the number of cycles when no uops are allocated and a RATstall is asserted. ++ 0x50 extra: not_delivered The NO_ALLOC_CYCLES.NOT_DELIVERED event is used to measure front-end inefficiencies, i.e. when front-end of the machine is not delivering micro-ops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into micro-ops (uops) in machine understandable format and putting them into a micro-op queue to be consumed by back end. The back-end then takes these micro-ops, allocates the required resources. When all resources are ready, micro-ops are executed. If the back-end is not ready to accept micro-ops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls and eventually forcing the front-end to wait until the back-end is ready to receive more UOPS. This event counts the cycles only when back-end is requesting more uops and front-end is not able to provide them. Some examples of conditions that cause front-end inefficiencies are: Icache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. ++name:rs_full_stall type:exclusive default:0x1f ++ 0x1f extra: all Counts the number of cycles the Alloc pipeline is stalled when any one of the RSs (IEC, FPC and MEC) is full. This event is a superset of all the individual RS stall event counts.
++ 0x1 extra: mec Counts the number of cycles the allocation pipeline is stalled and is waiting for a free MEC reservation station entry. The cycles should be appropriately counted in case of the cracked ops e.g. In case of a cracked load-op, the load portion is sent to M ++name:baclears type:exclusive default:0x1 ++ 0x1 extra: all The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.ANY event counts the number of baclears for any type of branch. ++ 0x8 extra: return The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.RETURN event counts the number of RETURN baclears. ++ 0x10 extra: cond The BACLEARS event counts the number of times the front end is resteered, mainly when the Branch Prediction Unit cannot provide a correct prediction and this is corrected by the Branch Address Calculator at the front end. The BACLEARS.COND event counts the number of JCC (Jump on Conditional Code) baclears. 
diff --git a/SOURCES/oprofile-xml.patch b/SOURCES/oprofile-xml.patch new file mode 100644 index 0000000..abddd48 --- /dev/null +++ b/SOURCES/oprofile-xml.patch @@ -0,0 +1,236 @@ +diff -up oprofile-0.9.9/doc/ophelp.xsd.ophelp oprofile-0.9.9/doc/ophelp.xsd +--- oprofile-0.9.9/doc/ophelp.xsd.ophelp 2014-05-28 10:09:46.279270117 -0400 ++++ oprofile-0.9.9/doc/ophelp.xsd 2014-05-28 10:08:59.416060557 -0400 +@@ -0,0 +1,57 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +commit a339a069f4ceba748df44d2babd9f08ce06abd78 +Author: Maynard Johnson +Date: Thu Nov 7 08:24:05 2013 -0600 + + ophelp schema is not included in installed files + + A one-line change in doc/Makefile.am was needed in order for + 'make install' to put ophelp.xsd in /share/doc/oprofile. + + Signed-off-by: Maynard Johnson + +diff --git a/doc/Makefile.am b/doc/Makefile.am +index 45fbe92..258842f 100644 +--- a/doc/Makefile.am ++++ b/doc/Makefile.am +@@ -26,7 +26,7 @@ man_MANS += operf.1 \ + endif + + htmldir = $(prefix)/share/doc/oprofile +-dist_html_DATA = oprofile.html internals.html opreport.xsd op-jit-devel.html ++dist_html_DATA = oprofile.html internals.html opreport.xsd ophelp.xsd op-jit-devel.html + + if have_xsltproc + + +commit ed40d8d444a17e7cf16a4653607b04a24f4c0513 +Author: William Cohen +Date: Tue Jan 28 11:05:46 2014 -0600 + + Print unit mask name where applicable in ophelp XML output + + Some Intel architectures have named unit masks and it would be useful + to include the unit mask name in the XML output. This patch also + updates the ophelp.xsd schema file to include the optional unit + mask 'name' field. 
+ + Signed-off-by: William Cohen + +diff --git a/doc/ophelp.xsd b/doc/ophelp.xsd +index 9bd7f82..c07bdb4 100644 +--- a/doc/ophelp.xsd ++++ b/doc/ophelp.xsd +@@ -49,6 +49,7 @@ + + + ++ + + + +diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c +index 3b1af21..de107c2 100644 +--- a/libop/op_xml_events.c ++++ b/libop/op_xml_events.c +@@ -95,6 +95,10 @@ void xml_help_for_event(struct op_event const * event) + close_xml_element(NONE, 1, buffer, MAX_BUFFER); + for (i = 0; i < event->unit->num; i++) { + open_xml_element(HELP_UNIT_MASK, 1, buffer, MAX_BUFFER); ++ if (event->unit->um[i].name) ++ init_xml_str_attr(HELP_UNIT_MASK_NAME, ++ event->unit->um[i].name, ++ buffer, MAX_BUFFER); + init_xml_int_attr(HELP_UNIT_MASK_VALUE, + event->unit->um[i].value, + buffer, MAX_BUFFER); +diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c +index 0b3deea..ac3c97b 100644 +--- a/libop/op_xml_out.c ++++ b/libop/op_xml_out.c +@@ -84,7 +84,8 @@ char const * xml_tag_map[] = { + "unit_mask", + "mask", + "desc", +- "extra" ++ "extra", ++ "name" + }; + + #define MAX_BUF_LEN 2048 +diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h +index 544bd51..6d5a468 100644 +--- a/libop/op_xml_out.h ++++ b/libop/op_xml_out.h +@@ -59,6 +59,7 @@ typedef enum { + HELP_UNIT_MASK_VALUE, + HELP_UNIT_MASK_DESC, + HELP_UNIT_EXTRA_VALUE, ++ HELP_UNIT_MASK_NAME, + } tag_t; + + char const * xml_tag_name(tag_t tag); +commit fd05dade355b482ee9286b7bf90b4b150f49f81c +Author: Maynard Johnson +Date: Mon Feb 3 08:47:30 2014 -0600 + + Remove 'extra' attribute from ophelp XML output; bump schema version + + As discussed on the oprofile mailing list on Sep 24, 2013, there is + no value add in keeping the 'extra' attribute in ophelp's XML output. + The previous commit added the 'name' field to the XML output, and + that is actual valuable information that consumers of the XML output + should use when coding event specifications to pass to operf or + ocount. 
+ + This patch removes the 'extra' attribute and also bumps the schema + version (both in the ophelp.xsd and the XML instance documents). + The schema bump is needed mostly due to removing the 'extra' attribute; + but another reason for it is to draw attention to the new 'name' + attribute, which consumers really must use (when present) in order + to be sure they can properly specify the unitmask that the user + requests. + + Signed-off-by: Maynard Johnson + +diff --git a/doc/ophelp.xsd b/doc/ophelp.xsd +index c07bdb4..1270121 100644 +--- a/doc/ophelp.xsd ++++ b/doc/ophelp.xsd +@@ -11,7 +11,7 @@ + + + +- ++ + + + +diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c +index de107c2..c301732 100644 +--- a/libop/op_xml_events.c ++++ b/libop/op_xml_events.c +@@ -21,7 +21,7 @@ static char buffer[MAX_BUFFER]; + + void open_xml_events(char const * title, char const * doc, op_cpu the_cpu_type) + { +- char const * schema_version = "1.1"; ++ char const * schema_version = "2.0"; + + buffer[0] = '\0'; + cpu_type = the_cpu_type; +@@ -105,10 +105,6 @@ void xml_help_for_event(struct op_event const * event) + init_xml_str_attr(HELP_UNIT_MASK_DESC, + event->unit->um[i].desc, + buffer, MAX_BUFFER); +- if (event->unit->um[i].extra) +- init_xml_int_attr(HELP_UNIT_EXTRA_VALUE, +- event->unit->um[i].extra, +- buffer, MAX_BUFFER); + close_xml_element(NONE, 0, buffer, MAX_BUFFER); + } + close_xml_element(HELP_UNIT_MASKS, 0, buffer, MAX_BUFFER); +diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c +index ac3c97b..63ee41c 100644 +--- a/libop/op_xml_out.c ++++ b/libop/op_xml_out.c +@@ -84,7 +84,6 @@ char const * xml_tag_map[] = { + "unit_mask", + "mask", + "desc", +- "extra", + "name" + }; + +diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h +index 6d5a468..a829f66 100644 +--- a/libop/op_xml_out.h ++++ b/libop/op_xml_out.h +@@ -58,7 +58,6 @@ typedef enum { + HELP_UNIT_MASK, + HELP_UNIT_MASK_VALUE, + HELP_UNIT_MASK_DESC, +- HELP_UNIT_EXTRA_VALUE, + HELP_UNIT_MASK_NAME, + } tag_t; + 
diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec index ddc9732..93c68e9 100644 --- a/SPECS/oprofile.spec +++ b/SPECS/oprofile.spec @@ -1,7 +1,7 @@ Summary: System wide profiler Name: oprofile Version: 0.9.9 -Release: 4%{?dist} +Release: 7%{?dist} License: GPLv2+ and LGPLv2+ Group: Development/System # @@ -12,6 +12,16 @@ Requires(pre): shadow-utils Requires(postun): shadow-utils Patch10: oprofile-0.4-guess2.patch Patch83: oprofile-0.9.7-xen.patch +Patch303: oprofile-num_symbolic.patch +Patch304: oprofile-xml.patch +Patch305: oprofile-rhbz1121205.patch +Patch400: oprofile-haswell.patch +Patch401: oprofile-silvermont.patch +Patch402: oprofile-broadwell.patch +Patch500: oprofile-aarch64.patch +Patch600: oprofile-power8.patch +Patch601: oprofile-ppc64le.patch +Patch602: oprofile-ppc64-equivalent.patch URL: http://oprofile.sf.net @@ -76,6 +86,16 @@ agent library. %setup -q -n %{name}-%{version} %patch10 -p1 -b .guess2 %patch83 -p1 -b .xen +%patch303 -p1 -b .num_symbolic +%patch304 -p1 -b .xml +%patch305 -p1 -b .xml +%patch400 -p1 -b .haswell +%patch401 -p1 -b .silvermont +%patch402 -p1 -b .broadwell +%patch500 -p1 -b .aarch64 +%patch600 -p1 -b .power8 +%patch601 -p1 -b .ppc64le +%patch602 -p1 ./autogen.sh @@ -162,6 +182,20 @@ exit 0 %{_sysconfdir}/ld.so.conf.d/* %changelog +* Wed Oct 1 2014 Will Cohen - 0.9.9-7 +- Correct identification power8le. rhbz1148525 + +* Wed Sep 17 2014 Will Cohen - 0.9.9-6 +- Update support for Intel Silvermont (Avoton). +- Enable configure for ppc64le. + +* Mon Aug 18 2014 Will Cohen - 0.9.9-5 +- Update Intel Haswell events. +- Add support for Intel Silvermont (Avoton). +- Add support for Intel Broadwell. +- Add support for aarch64. +- Update IBM power8 events. + * Fri Jan 24 2014 Daniel Mach - 0.9.9-4 - Mass rebuild 2014-01-24