Blob Blame History Raw
commit a99127699330dce984dba38156230ab3584d0d6e
Author: William Cohen <wcohen@redhat.com>
Date:   Mon Nov 30 17:13:32 2015 -0500

    Make Intel Westmere and Nehalem event names unique
    
    The Intel Westmere and Nehalem event lists each had two events named
    MACRO_INSTS.  The event names in the event lists need to be unique.
    The event referring to the Macro-fused instructions decoded (0xa6)
    has been renamed MACRO_INSTS_FUSED to avoid the name collision with
    MACRO_INSTS.
    
    Signed-off-by: William Cohen <wcohen@redhat.com>

diff --git a/events/i386/nehalem/events b/events/i386/nehalem/events
index 31a08b6..6951f35 100644
--- a/events/i386/nehalem/events
+++ b/events/i386/nehalem/events
@@ -68,7 +68,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:6000 name:ILD_STALL : Cycles In
 event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:6000 name:BR_INST_EXEC : Counts the number of near branch instructions executed, but not necessarily retired.
 event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:6000 name:BR_MISP_EXEC : Counts the number of mispredicted conditional near branch instructions executed, but not necessarily retired.
 event:0xA2 counters:0,1,2,3 um:resource_stalls minimum:6000 name:RESOURCE_STALLS : Counts the number of Allocator resource related stalls. Includes register renaming buffer entries, memory buffer entries. In addition to resource related stalls, this event counts some other events. Includes stalls arising during branch misprediction recovery, such as if retirement of the mispredicted branch is delayed and stalls arising while store buffer is draining from synchronizing operations.
-event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired.
+event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS_FUSED : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired.
 event:0xA7 counters:0,1,2,3 um:one minimum:6000 name:BACLEAR_FORCE_IQ : Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ is also responsible for providing conditional branch prediciton direction based on a static scheme and dynamic data provided by the L2 Branch Prediction Unit. If the conditional branch target is not found in the Target Array and the IQ predicts that the branch is taken, then the IQ will force the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble in the instruction fetch pipeline.
 event:0xA8 counters:0,1,2,3 um:one minimum:6000 name:LSD : Counts the number of micro-ops delivered by loop stream detector
 event:0xAE counters:0,1,2,3 um:one minimum:6000 name:ITLB_FLUSH : Counts the number of ITLB flushes
diff --git a/events/i386/westmere/events b/events/i386/westmere/events
index d919867..d7b2064 100644
--- a/events/i386/westmere/events
+++ b/events/i386/westmere/events
@@ -48,7 +48,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:2000000 name:ILD_STALL : Any In
 event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:200000 name:BR_INST_EXEC : Branch instructions executed
 event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:20000 name:BR_MISP_EXEC : Mispredicted branches executed
 event:0xa2 counters:0,1,2,3 um:resource_stalls minimum:2000000 name:RESOURCE_STALLS : Resource related stall cycles
-event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS : Macro-fused instructions decoded
+event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS_FUSED : Macro-fused instructions decoded
 event:0xa7 counters:0,1,2,3 um:x01 minimum:2000000 name:BACLEAR_FORCE_IQ : Instruction queue forced BACLEAR
 event:0xa8 counters:0,1,2,3 um:x01 minimum:2000000 name:LSD : Cycles when uops were delivered by the LSD
 event:0xae counters:0,1,2,3 um:x01 minimum:2000000 name:ITLB_FLUSH : ITLB flushes
commit dc9076e99c9afada60cbe81dd43772cb72ec509d
Author: Michael Petlan <mpetlan@redhat.com>
Date:   Thu Apr 30 10:34:48 2015 -0400

    Fix default unit masks for Haswells
    
    Since some of the default unit masks for Haswell events cannot be
    uniquely specified by numbers, the defaults have had to be replaced
    by the named ones. When the affected events are used on Haswell without
    specifying unit masks after applying this patch, the default masks
    are chosen correctly.
    
    Signed-off-by: Michael Petlan <mpetlan@redhat.com>

diff --git a/events/i386/haswell/unit_masks b/events/i386/haswell/unit_masks
index 60c2a61..9b4be33 100644
--- a/events/i386/haswell/unit_masks
+++ b/events/i386/haswell/unit_masks
@@ -32,7 +32,7 @@ name:dtlb_load_misses type:exclusive default:0x1
 	0x80 extra: pde_cache_miss DTLB demand load misses with low part of linear-to-physical address translation missed
 	0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size.
 	0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks
-name:uops_issued type:exclusive default:0x1
+name:uops_issued type:exclusive default:any
 	0x1 extra: any This event counts the number of uops issued by the Front-end of the pipeline to the Back-end. This event is counted at the allocation stage and will count both retired and non-retired uops.
 	0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch.
 	0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not.
@@ -56,7 +56,7 @@ name:l2_rqsts type:exclusive default:0x21
 	0xe7 extra: all_demand_references Demand requests to L2 cache
 	0x3f extra: miss All requests that miss L2 cache
 	0xff extra: references All L2 requests
-name:l1d_pend_miss type:exclusive default:0x1
+name:l1d_pend_miss type:exclusive default:pending
 	0x1 extra: pending L1D miss oustandings duration in cycles
 	0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding.
 name:dtlb_store_misses type:exclusive default:0x1
@@ -85,7 +85,7 @@ name:move_elimination type:exclusive default:0x1
 	0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated.
 	0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated.
 	0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated.
-name:cpl_cycles type:exclusive default:0x1
+name:cpl_cycles type:exclusive default:ring0
 	0x1 extra: ring0 Unhalted core cycles when the thread is in ring 0
 	0x2 extra: ring123 Unhalted core cycles when thread is in rings 1, 2, or 3
 	0x1 extra:cmask=1,edge ring0_trans Number of intervals between processor halts while thread is in ring 0
@@ -95,10 +95,10 @@ name:tx_exec type:exclusive default:0x1
 	0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded
 	0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region.
 	0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region
-name:rs_events type:exclusive default:0x1
+name:rs_events type:exclusive default:empty_cycles
 	0x1 extra: empty_cycles This event counts cycles when the Reservation Station ( RS ) is empty for the thread. The RS is a structure that buffers allocated micro-ops from the Front-end. If there are many cycles when the RS is empty, it may represent an underflow of instructions delivered from the Front-end.
 	0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues.
-name:offcore_requests_outstanding type:exclusive default:0x1
+name:offcore_requests_outstanding type:exclusive default:demand_data_rd
 	0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue.
 	0x2 extra: demand_code_rd Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle
 	0x4 extra: demand_rfo Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore
@@ -164,14 +164,14 @@ name:br_misp_exec type:exclusive default:0xff
 	0xc1 extra: all_conditional Speculative and retired mispredicted macro conditional branches
 	0xc4 extra: all_indirect_jump_non_call_ret Mispredicted indirect branches excluding calls and returns
 	0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls
-name:idq_uops_not_delivered type:exclusive default:0x1
+name:idq_uops_not_delivered type:exclusive default:core
 	0x1 extra: core This event count the number of undelivered (unallocated) uops from the Front-end to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. The Front-end can allocate up to 4 uops per cycle so this event can increment 0-4 times per cycle depending on the number of unallocated uops. This event is counted on a per-core basis.
 	0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts the number cycles during which the Front-end allocated exactly zero uops to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled.  This event is counted on a per-core basis.
 	0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled
 	0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end.
 	0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end.
 	0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
-name:uops_executed_port type:exclusive default:0x1
+name:uops_executed_port type:exclusive default:port_0
 	0x1 extra: port_0 Cycles per thread when uops are executed in port 0
 	0x2 extra: port_1 Cycles per thread when uops are executed in port 1
 	0x4 extra: port_2 Cycles per thread when uops are executed in port 2
@@ -236,7 +236,7 @@ name:other_assists type:exclusive default:0x8
 	0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. Errata: HSM57
 	0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. Errata: HSM57
 	0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback.
-name:uops_retired type:exclusive default:0x1
+name:uops_retired type:exclusive default:all
 	0x1 extra: all Actually retired uops.
 	0x1 extra: all_pebs Actually retired uops.
 	0x2 extra: retire_slots This event counts the number of retirement slots used each cycle.  There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle.
@@ -244,13 +244,13 @@ name:uops_retired type:exclusive default:0x1
 	0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops.
 	0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops.
 	0x1 extra:cmask=1,inv core_stall_cycles Cycles without actually retired uops.
-name:machine_clears type:exclusive default:0x1
+name:machine_clears type:exclusive default:cycles
 	0x1 extra: cycles Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes.
 	0x2 extra: memory_ordering This event counts the number of memory ordering machine clears detected. Memory ordering machine clears can result from memory address aliasing or snoops from another hardware thread or core to data inflight in the pipeline.  Machine clears can have a significant performance impact if they are happening frequently.
 	0x4 extra: smc This event is incremented when self-modifying code (SMC) is detected, which causes a machine clear.  Machine clears can have a significant performance impact if they are happening frequently.
 	0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0.
 	0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type.
-name:br_inst_retired type:exclusive default:0x1
+name:br_inst_retired type:exclusive default:conditional
 	0x1 extra: conditional Conditional branch instructions retired.
 	0x1 extra: conditional_pebs Conditional branch instructions retired.
 	0x2 extra: near_call Direct and indirect near call instructions retired.
@@ -262,7 +262,7 @@ name:br_inst_retired type:exclusive default:0x1
 	0x20 extra: near_taken_pebs Taken branch instructions retired.
 	0x40 extra: far_branch Far branch instructions retired.
 	0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired.
-name:br_misp_retired type:exclusive default:0x1
+name:br_misp_retired type:exclusive default:conditional
 	0x1 extra: conditional Mispredicted conditional branch instructions retired.
 	0x1 extra: conditional_pebs Mispredicted conditional branch instructions retired.
 	0x4 extra:pebs all_branches_pebs This event counts all mispredicted branch instructions retired. This is a precise event.
@@ -294,7 +294,7 @@ name:fp_assist type:exclusive default:0x1e
 	0x4 extra: x87_input Number of X87 assists due to input value.
 	0x8 extra: simd_output Number of SIMD FP assists due to Output values
 	0x10 extra: simd_input Number of SIMD FP assists due to input values
-name:mem_uops_retired type:exclusive default:0x11
+name:mem_uops_retired type:exclusive default:stlb_miss_loads
 	0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. Errata: HSM30
 	0x11 extra: stlb_miss_loads_pebs Load uops with true STLB miss retired to architected path. Errata: HSM30
 	0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. Errata: HSM30
@@ -309,7 +309,7 @@ name:mem_uops_retired type:exclusive default:0x11
 	0x81 extra: all_loads_pebs Load uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30
 	0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30
 	0x82 extra: all_stores_pebs Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30
-name:mem_load_uops_retired type:exclusive default:0x1
+name:mem_load_uops_retired type:exclusive default:l1_hit
 	0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. Errata: HSM30
 	0x1 extra: l1_hit_pebs Retired load uops with L1 cache hits as data sources. Errata: HSM30
 	0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. Errata: HSM30
@@ -324,7 +324,7 @@ name:mem_load_uops_retired type:exclusive default:0x1
 	0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30
 	0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30
 	0x40 extra: hit_lfb_pebs Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30
-name:mem_load_uops_l3_hit_retired type:exclusive default:0x1
+name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss
 	0x1 extra: xsnp_miss Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30
 	0x1 extra: xsnp_miss_pebs Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30
 	0x2 extra: xsnp_hit Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30
@@ -333,7 +333,7 @@ name:mem_load_uops_l3_hit_retired type:exclusive default:0x1
 	0x4 extra: xsnp_hitm_pebs Retired load uops which data sources were HitM responses from shared L3. Errata: HSM26, HSM30
 	0x8 extra: xsnp_none Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30
 	0x8 extra: xsnp_none_pebs Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30
-name:mem_load_uops_l3_miss_retired type:exclusive default:0x1
+name:mem_load_uops_l3_miss_retired type:exclusive default:local_dram
 	0x1 extra: local_dram This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30
 	0x1 extra: local_dram_pebs This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30
 name:l2_trans type:exclusive default:0x80