diff --git a/SOURCES/oprofile-broadwell.patch b/SOURCES/oprofile-broadwell.patch index c328b12..a08001d 100644 --- a/SOURCES/oprofile-broadwell.patch +++ b/SOURCES/oprofile-broadwell.patch @@ -851,3 +851,190 @@ index 9c27e6c..b8900a5 100644 pe->unit_mask); exit(EXIT_FAILURE); } +commit 62e7814e8467230d8e283992ee6532d5f794359a +Author: Michael Petlan +Date: Thu Jun 11 11:24:51 2015 -0400 + + Fix default unit masks for Intel Broadwell + + Since some of the default unit masks for Intel Broadwell events cannot be + uniquely specified by numbers, the defaults have had to be replaced + by the named ones. When the affected events are used on Broadwell without + specifying unit masks after applying this patch, the default masks + are chosen correctly. + + Signed-off-by: William Cohen + +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 0d6ccd5..4e69363 100644 +--- a/events/i386/broadwell/unit_masks ++++ b/events/i386/broadwell/unit_masks +@@ -31,7 +31,7 @@ name:dtlb_load_misses type:exclusive default:0x1 + 0x20 extra: stlb_hit_4k Load misses that miss the DTLB and hit the STLB (4K) + 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +-name:uops_issued type:exclusive default:0x1 ++name:uops_issued type:exclusive default:any + 0x1 extra: any This event counts the number of Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS). + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. +@@ -54,7 +54,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xe7 extra: all_demand_references Demand requests to L2 cache + 0x3f extra: miss All requests that miss L2 cache + 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:0x1 ++name:l1d_pend_miss type:exclusive default:pending + 0x1 extra: pending This event counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand; from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. + 0x1 extra:cmask=1 pending_cycles This event counts duration of L1D miss outstanding in cycles. + name:dtlb_store_misses type:exclusive default:0x1 +@@ -77,7 +77,7 @@ name:move_elimination type:exclusive default:0x1 + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. 
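To see concretely what the commit message above means by defaults that "cannot be uniquely specified by numbers": in a group such as uops_retired below, the value 0x1 appears on several lines (all, all_pebs, stall_cycles, total_cycles) that differ only in their extra: modifiers or PEBS variant, so default:0x1 no longer names a single mask, while default:all does. The following is a minimal standalone sketch of that lookup problem, not oprofile's actual resolver; the struct and helper are hypothetical:

    #include <string>
    #include <vector>

    // Hypothetical model of one line in a unit_masks group.
    struct mask_entry {
        unsigned value;     // numeric unit mask, e.g. 0x1
        std::string extra;  // modifiers such as "cmask=1,inv" (empty if none)
        std::string name;   // e.g. "all", "all_pebs", "stall_cycles"
    };

    // How many entries would a numeric default such as 0x1 match?
    static int matches_by_value(std::vector<mask_entry> const & group, unsigned v)
    {
        int n = 0;
        for (std::vector<mask_entry>::const_iterator it = group.begin();
             it != group.end(); ++it)
            if (it->value == v)
                ++n;
        return n;
    }

    int main()
    {
        // Three of the uops_retired lines from this file all encode 0x1:
        std::vector<mask_entry> uops_retired;
        uops_retired.push_back(mask_entry{0x1, "", "all"});
        uops_retired.push_back(mask_entry{0x1, "", "all_pebs"});
        uops_retired.push_back(mask_entry{0x1, "cmask=1,inv", "stall_cycles"});

        // default:0x1 is ambiguous (three candidates); default:all picks one.
        return matches_by_value(uops_retired, 0x1) == 3 ? 0 : 1;
    }

With the named defaults in place, an event specification that omits the unit mask, for example operf -e uops_retired:2000003 (operf's event:count syntax), resolves to the intended default instead of hitting the ambiguity.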
+-name:cpl_cycles type:exclusive default:0x1 ++name:cpl_cycles type:exclusive default:ring0 + 0x1 extra: ring0 This event counts the unhalted core cycles during which the thread is in the ring 0 privileged mode. + 0x2 extra: ring123 This event counts unhalted core cycles during which the thread is in rings 1, 2, or 3. + 0x1 extra:cmask=1,edge ring0_trans This event counts when there is a transition from ring 1,2 or 3 to ring0. +@@ -87,10 +87,10 @@ name:tx_exec type:exclusive default:0x1 + 0x4 extra: misc3 Unfriendly TSX abort triggered by a nest count that is too deep + 0x8 extra: misc4 RTM region detected inside HLE + 0x10 extra: misc5 # HLE inside HLE+ +-name:rs_events type:exclusive default:0x1 ++name:rs_events type:exclusive default:empty_cycles + 0x1 extra: empty_cycles This event counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues. + 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. +-name:offcore_requests_outstanding type:exclusive default:0x1 ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd + 0x1 extra: demand_data_rd This event counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point. + 0x2 extra: demand_code_rd This event counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The "Offcore outstanding" state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. + 0x4 extra: demand_rfo This event counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. +@@ -147,14 +147,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional This event counts both taken and not taken speculative and retired mispredicted macro conditional branch instructions. + 0xc4 extra: all_indirect_jump_non_call_ret This event counts both taken and not taken mispredicted indirect branches excluding calls and returns. + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:0x1 ++name:idq_uops_not_delivered type:exclusive default:core + 0x1 extra: core This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding "4 - x" when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread; b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); c. Instruction Decode Queue (IDQ) delivers four uops.
+ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4. + 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core This event counts, on the per-thread basis, cycles when less than 1 uop is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >=3. + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +-name:uops_executed_port type:exclusive default:0x1 ++name:uops_executed_port type:exclusive default:port_0 + 0x1 extra:any port_0_core Cycles per core when uops are executed in port 0 + 0x2 extra:any port_1_core Cycles per core when uops are executed in port 1 + 0x4 extra:any port_2_core Cycles per core when uops are dispatched to port 2 +@@ -200,7 +200,7 @@ name:cycle_activity type:exclusive default:0x1 + 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. + 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. + 0x6 extra:cmask=6 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. +-name:lsd type:exclusive default:0x1 ++name:lsd type:exclusive default:uops + 0x1 extra: uops Number of Uops delivered by the LSD. Read more on LSD under LSD_REPLAY.REPLAY + 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder + 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder +@@ -209,7 +209,7 @@ name:offcore_requests type:exclusive default:0x1 + 0x2 extra: demand_code_rd This event counts both cacheable and noncacheable code read requests. + 0x4 extra: demand_rfo This event counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. + 0x8 extra: all_data_rd This event counts the demand and prefetch data reads. All Core Data Reads include cacheable "Demands" and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. +-name:uops_executed type:exclusive default:0x1 ++name:uops_executed type:exclusive default:thread + 0x1 extra: thread Number of uops to be executed per-thread each cycle. + 0x2 extra: core Number of uops executed from any thread + 0x1 extra:cmask=1,inv stall_cycles This event counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. +@@ -232,20 +232,20 @@ name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable. + 0x10 extra: sse_to_avx This is a non-precise version (that is, does not use PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable. + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:0x1 ++name:uops_retired type:exclusive default:all + 0x1 extra: all This is a non-precise version (that is, does not use PEBS) of the event that counts all actually retired uops.
Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x1 extra: all_pebs Counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight. + 0x2 extra: retire_slots This is a non-precise version (that is, does not use PEBS) of the event that counts the number of retirement slots used. + 0x2 extra: retire_slots_pebs Counts the number of retirement slots used. + 0x1 extra:cmask=1,inv stall_cycles This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops. + 0x1 extra:cmask=a,inv total_cycles Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event. +-name:machine_clears type:exclusive default:0x1 ++name:machine_clears type:exclusive default:cycles + 0x1 extra: cycles This event counts both thread-specific (TS) and all-thread (AT) nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: 1. memory disambiguation, 2. external snoop, or 3. cross SMT-HW-thread snoop (stores) hitting load buffer. + 0x4 extra: smc This event counts self-modifying code (SMC) detected, which causes a machine clear. + 0x20 extra: maskmov Maskmov false fault - counts number of times ucode passes through Maskmov flow due to instruction's mask being 0 while the flow was completed without raising a fault. + 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. +-name:br_inst_retired type:exclusive default:0x1 ++name:br_inst_retired type:exclusive default:conditional + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired. + 0x1 extra: conditional_pebs Counts conditional branch instructions retired. + 0x2 extra: near_call This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired. +@@ -257,7 +257,7 @@ name:br_inst_retired type:exclusive default:0x1 + 0x20 extra: near_taken_pebs Counts taken branch instructions retired. + 0x40 extra: far_branch This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired. + 0x4 extra:pebs all_branches_pebs This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired. +-name:br_misp_retired type:exclusive default:0x1 ++name:br_misp_retired type:exclusive default:conditional + 0x1 extra: conditional This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired. + 0x1 extra: conditional_pebs Counts mispredicted conditional branch instructions retired. + 0x4 extra:pebs all_branches_pebs This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired. +@@ -289,7 +289,7 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input This is a non-precise version (that is, does not use PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid.
+ 0x8 extra: simd_output This is a non-precise version (that is, does not use PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention. + 0x10 extra: simd_input This is a non-precise version (that is, does not use PEBS) of the event that counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention. +-name:mem_uops_retired type:exclusive default:0x11 ++name:mem_uops_retired type:exclusive default:stlb_miss_loads + 0x11 extra: stlb_miss_loads This is a non-precise version (that is, does not use PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x11 extra: stlb_miss_loads_pebs Counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. + 0x12 extra: stlb_miss_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault. +@@ -304,7 +304,7 @@ name:mem_uops_retired type:exclusive default:0x11 + 0x81 extra: all_loads_pebs Counts load uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches. + 0x82 extra: all_stores This is a non-precise version (that is, does not use PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. + 0x82 extra: all_stores_pebs Counts store uops retired to the architected path with a filter on bits 0 and 1 applied. Note: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. +-name:mem_load_uops_retired type:exclusive default:0x1 ++name:mem_load_uops_retired type:exclusive default:l1_hit + 0x1 extra: l1_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source + 0x1 extra: l1_hit_pebs Counts retired load uops which data sources were hits in the nearest-level (L1) cache. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load.
This event also counts SW prefetches independent of the actual data source + 0x2 extra: l2_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache. +@@ -319,7 +319,7 @@ name:mem_load_uops_retired type:exclusive default:0x1 + 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. + 0x40 extra: hit_lfb This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. + 0x40 extra: hit_lfb_pebs Counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready. Note: Only two data-sources of L1/FB are applicable for AVX-256bit even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. +-name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss + 0x1 extra: xsnp_miss This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x1 extra: xsnp_miss_pebs Counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache. + 0x2 extra: xsnp_hit This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache. +@@ -328,7 +328,7 @@ name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x4 extra: xsnp_hitm_pebs Counts retired load uops which data sources were HitM responses from a core on same socket (shared L3). + 0x8 extra: xsnp_none This is a non-precise version (that is, does not use PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. + 0x8 extra: xsnp_none_pebs Counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required. +-name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_miss_retired type:exclusive default:local_dram + 0x1 extra: local_dram Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + 0x1 extra: local_dram_pebs Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI) + name:l2_trans type:exclusive default:0x80 +commit 723a3042bd23deca01a36f6d99cdf10fe935c0d0 +Author: William Cohen +Date: Thu Jun 11 16:56:16 2015 -0400 + + Use a named default for the Intel Broadwell cycle_activity default unit_mask + + Since the default unit mask for Intel Broadwell cycle_activity cannot be + uniquely specified by numbers, the default has to be replaced by a + named one.
+ + Signed-off-by: William Cohen + +diff --git a/events/i386/broadwell/unit_masks b/events/i386/broadwell/unit_masks +index 4e69363..505ba21 100644 +--- a/events/i386/broadwell/unit_masks ++++ b/events/i386/broadwell/unit_masks +@@ -185,7 +185,7 @@ name:resource_stalls type:exclusive default:0x1 + 0x4 extra: rs This event counts stall cycles caused by absence of eligible entries in the reservation station (RS). This may result from RS overflow, or from RS deallocation because of the RS array Write Port allocation scheme (each RS entry has two write ports instead of four. As a result, empty entries could not be used, although RS is not really full). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x8 extra: sb This event counts stall cycles caused by the store buffer (SB) overflow (excluding draining from synch). This counts cycles that the pipeline backend blocked uop delivery from the front end. + 0x10 extra: rob This event counts ROB full stall cycles. This counts cycles that the pipeline backend blocked uop delivery from the front end. +-name:cycle_activity type:exclusive default:0x1 ++name:cycle_activity type:exclusive default:cycles_l2_pending + 0x1 extra:cmask=1 cycles_l2_pending Counts number of cycles the CPU has at least one pending demand* load request missing the L2 cache. + 0x8 extra:cmask=8 cycles_l1d_pending Counts number of cycles the CPU has at least one pending demand load request missing the L1 data cache. + 0x2 extra:cmask=2 cycles_ldm_pending Counts number of cycles the CPU has at least one pending demand load request (that is cycles with non-completed load waiting for its data from memory subsystem) diff --git a/SOURCES/oprofile-bz1264443.patch b/SOURCES/oprofile-bz1264443.patch new file mode 100644 index 0000000..5a340e6 --- /dev/null +++ b/SOURCES/oprofile-bz1264443.patch @@ -0,0 +1,93 @@ +diff -up oprofile-0.9.9/libpp/profile_spec.cpp.archive oprofile-0.9.9/libpp/profile_spec.cpp +--- oprofile-0.9.9/libpp/profile_spec.cpp.archive 2013-07-29 11:55:06.000000000 -0400 ++++ oprofile-0.9.9/libpp/profile_spec.cpp 2016-07-06 11:20:55.076624764 -0400 +@@ -102,6 +102,8 @@ void profile_spec::set_image_or_lib_name + void profile_spec::parse_archive_path(string const & str) + { + archive_path = op_realpath(str); ++ /* Need to force session directory default location in the archive */ ++ init_op_config_dirs(OP_SESSION_DIR_DEFAULT); + } + + +diff -up oprofile-0.9.9/pp/oparchive.cpp.archive oprofile-0.9.9/pp/oparchive.cpp +--- oprofile-0.9.9/pp/oparchive.cpp.archive 2013-07-29 11:55:06.000000000 -0400 ++++ oprofile-0.9.9/pp/oparchive.cpp 2016-07-06 11:20:55.076624764 -0400 +@@ -232,6 +232,19 @@ int oparchive(options::spec const & spec + } + } + ++ /* place samples and other related material in easily found default directory */ ++ string dest_session_dir = options::outdirectory + string(OP_SESSION_DIR_DEFAULT); ++ string dest_samples_dir = dest_session_dir + string("samples"); ++ ++ /* dest_session_dir is parent of dest_samples and will also be created */ ++ ++ if (!options::list_files && ++ create_path(dest_samples_dir.c_str())) { ++ cerr << "Unable to create directory for " ++ << dest_samples_dir << "."
<< endl; ++ exit (EXIT_FAILURE); ++ + /* copy over each of the sample files */ + list<string>::iterator sit = sample_files.begin(); + list<string>::iterator const send = sample_files.end(); +@@ -245,9 +258,13 @@ int oparchive(options::spec const & spec + + for (; sit != send; ++sit) { + string sample_name = *sit; ++ /* determine the session name of sample file */ ++ int offset = sample_name.find('{'); ++ string base_samples_dir = sample_name.substr(0, offset-1); ++ string session = basename(base_samples_dir.c_str()); + /* Get rid of the the archive_path from the name */ +- string sample_base = sample_name.substr(archive_path.size()); +- string sample_archive_file = options::outdirectory + sample_base; ++ string sample_base = sample_name.substr(offset); ++ string sample_archive_file = dest_samples_dir + "/" + session + "/" + sample_base; + + cverb << vdebug << sample_name << endl; + cverb << vdebug << " destp " << sample_archive_file << endl; +@@ -268,19 +285,19 @@ int oparchive(options::spec const & spec + cerr << "Unable to to obtain realpath for " << op_session_dir << endl; + exit (EXIT_FAILURE); + } +- string abi_name = string(real_session_dir) + "/abi"; +- copy_one_file(image_ok, archive_path + abi_name, +- options::outdirectory + abi_name); ++ string abi_name = string(real_session_dir) + string("/abi"); ++ string dest_abi_name = dest_session_dir + string("/abi"); ++ copy_one_file(image_ok, archive_path + abi_name, dest_abi_name); + + /* copy over the /samples/oprofiled.log file */ +- string log_name = string(real_session_dir) + string("/samples") + "/oprofiled.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ string log_name = string(real_session_dir) + string("/samples") + string("/oprofiled.log"); ++ string dest_log_name = dest_samples_dir + string("/oprofiled.log"); ++ copy_one_file(image_ok, archive_path + log_name, dest_log_name); + + /* copy over the /samples/operf.log file */ +- log_name = string(real_session_dir) + string("/samples") + "/operf.log"; +- copy_one_file(image_ok, archive_path + log_name, +- options::outdirectory + log_name); ++ log_name = string(real_session_dir) + string("/samples") + string("/operf.log"); ++ dest_log_name = dest_samples_dir + string("/operf.log"); ++ copy_one_file(image_ok, archive_path + log_name, dest_log_name); + + free(real_session_dir); + +diff -up oprofile-0.9.9/pp/oparchive_options.cpp.archive oprofile-0.9.9/pp/oparchive_options.cpp +--- oprofile-0.9.9/pp/oparchive_options.cpp.archive 2016-07-06 11:20:55.077624764 -0400 ++++ oprofile-0.9.9/pp/oparchive_options.cpp 2016-07-06 11:26:13.968624764 -0400 +@@ -124,7 +124,6 @@ void handle_options(options::spec const + + if (strncmp(op_session_dir, "/var/lib/oprofile", strlen("/var/lib/oprofile"))) + cerr << "NOTE: The sample data in this archive is located at " << op_session_dir << endl +- << "instead of the standard location of /var/lib/oprofile. Hence, when using opreport" << endl +- << "and other post-processing tools on this archive, you must pass the following option:" << endl +- << "\t--session-dir=" << op_session_dir << endl; ++ << "and is being moved to the standard location of " << OP_SESSION_DIR_DEFAULT << "."
++ << endl; + } diff --git a/SOURCES/oprofile-bz1335145.patch b/SOURCES/oprofile-bz1335145.patch new file mode 100644 index 0000000..4eaf6a6 --- /dev/null +++ b/SOURCES/oprofile-bz1335145.patch @@ -0,0 +1,185 @@ +commit a99127699330dce984dba38156230ab3584d0d6e +Author: William Cohen +Date: Mon Nov 30 17:13:32 2015 -0500 + + Make Intel Westmere and Nehalem event names unique + + The Intel Westmere and Nehalem event lists each had two events named + MACRO_INSTS. The event names in the event lists need to be unique. + The event referring to the Macro-fused instructions decoded (0xa6) + has been renamed MACRO_INSTS_FUSED to avoid the name collision with + MACRO_INSTS. + + Signed-off-by: William Cohen + +diff --git a/events/i386/nehalem/events b/events/i386/nehalem/events +index 31a08b6..6951f35 100644 +--- a/events/i386/nehalem/events ++++ b/events/i386/nehalem/events +@@ -68,7 +68,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:6000 name:ILD_STALL : Cycles In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:6000 name:BR_INST_EXEC : Counts the number of near branch instructions executed, but not necessarily retired. + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:6000 name:BR_MISP_EXEC : Counts the number of mispredicted conditional near branch instructions executed, but not necessarily retired. + event:0xA2 counters:0,1,2,3 um:resource_stalls minimum:6000 name:RESOURCE_STALLS : Counts the number of Allocator resource related stalls. Includes register renaming buffer entries, memory buffer entries. In addition to resource related stalls, this event counts some other events. Includes stalls arising during branch misprediction recovery, such as if retirement of the mispredicted branch is delayed and stalls arising while store buffer is draining from synchronizing operations. +-event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. ++event:0xA6 counters:0,1,2,3 um:one minimum:6000 name:MACRO_INSTS_FUSED : Counts the number of instructions decoded that are macro-fused but not necessarily executed or retired. + event:0xA7 counters:0,1,2,3 um:one minimum:6000 name:BACLEAR_FORCE_IQ : Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ is also responsible for providing conditional branch prediction direction based on a static scheme and dynamic data provided by the L2 Branch Prediction Unit. If the conditional branch target is not found in the Target Array and the IQ predicts that the branch is taken, then the IQ will force the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble in the instruction fetch pipeline.
+ event:0xA8 counters:0,1,2,3 um:one minimum:6000 name:LSD : Counts the number of micro-ops delivered by loop stream detector + event:0xAE counters:0,1,2,3 um:one minimum:6000 name:ITLB_FLUSH : Counts the number of ITLB flushes +diff --git a/events/i386/westmere/events b/events/i386/westmere/events +index d919867..d7b2064 100644 +--- a/events/i386/westmere/events ++++ b/events/i386/westmere/events +@@ -48,7 +48,7 @@ event:0x87 counters:0,1,2,3 um:ild_stall minimum:2000000 name:ILD_STALL : Any In + event:0x88 counters:0,1,2,3 um:br_inst_exec minimum:200000 name:BR_INST_EXEC : Branch instructions executed + event:0x89 counters:0,1,2,3 um:br_misp_exec minimum:20000 name:BR_MISP_EXEC : Mispredicted branches executed + event:0xa2 counters:0,1,2,3 um:resource_stalls minimum:2000000 name:RESOURCE_STALLS : Resource related stall cycles +-event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS : Macro-fused instructions decoded ++event:0xa6 counters:0,1,2,3 um:x01 minimum:2000000 name:MACRO_INSTS_FUSED : Macro-fused instructions decoded + event:0xa7 counters:0,1,2,3 um:x01 minimum:2000000 name:BACLEAR_FORCE_IQ : Instruction queue forced BACLEAR + event:0xa8 counters:0,1,2,3 um:x01 minimum:2000000 name:LSD : Cycles when uops were delivered by the LSD + event:0xae counters:0,1,2,3 um:x01 minimum:2000000 name:ITLB_FLUSH : ITLB flushes +commit dc9076e99c9afada60cbe81dd43772cb72ec509d +Author: Michael Petlan +Date: Thu Apr 30 10:34:48 2015 -0400 + + Fix default unit masks for Haswells + + Since some of the default unit masks for Haswell events cannot be + uniquely specified by numbers, the defaults have had to be replaced + by the named ones. When the affected events are used on Haswell without + specifying unit masks after applying this patch, the default masks + are chosen correctly. + + Signed-off-by: Michael Petlan + +diff --git a/events/i386/haswell/unit_masks b/events/i386/haswell/unit_masks +index 60c2a61..9b4be33 100644 +--- a/events/i386/haswell/unit_masks ++++ b/events/i386/haswell/unit_masks +@@ -32,7 +32,7 @@ name:dtlb_load_misses type:exclusive default:0x1 + 0x80 extra: pde_cache_miss DTLB demand load misses with low part of linear-to-physical address translation missed + 0xe extra: walk_completed Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size. + 0x60 extra: stlb_hit Load operations that miss the first DTLB level but hit the second and do not cause page walks +-name:uops_issued type:exclusive default:0x1 ++name:uops_issued type:exclusive default:any + 0x1 extra: any This event counts the number of uops issued by the Front-end of the pipeline to the Back-end. This event is counted at the allocation stage and will count both retired and non-retired uops. + 0x10 extra: flags_merge Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch. + 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. 
+@@ -56,7 +56,7 @@ name:l2_rqsts type:exclusive default:0x21 + 0xe7 extra: all_demand_references Demand requests to L2 cache + 0x3f extra: miss All requests that miss L2 cache + 0xff extra: references All L2 requests +-name:l1d_pend_miss type:exclusive default:0x1 ++name:l1d_pend_miss type:exclusive default:pending + 0x1 extra: pending L1D miss outstanding duration in cycles + 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. + name:dtlb_store_misses type:exclusive default:0x1 +@@ -85,7 +85,7 @@ name:move_elimination type:exclusive default:0x1 + 0x2 extra: simd_eliminated Number of SIMD Move Elimination candidate uops that were eliminated. + 0x4 extra: int_not_eliminated Number of integer Move Elimination candidate uops that were not eliminated. + 0x8 extra: simd_not_eliminated Number of SIMD Move Elimination candidate uops that were not eliminated. +-name:cpl_cycles type:exclusive default:0x1 ++name:cpl_cycles type:exclusive default:ring0 + 0x1 extra: ring0 Unhalted core cycles when the thread is in ring 0 + 0x2 extra: ring123 Unhalted core cycles when thread is in rings 1, 2, or 3 + 0x1 extra:cmask=1,edge ring0_trans Number of intervals between processor halts while thread is in ring 0 +@@ -95,10 +95,10 @@ name:tx_exec type:exclusive default:0x1 + 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded + 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. + 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region +-name:rs_events type:exclusive default:0x1 ++name:rs_events type:exclusive default:empty_cycles + 0x1 extra: empty_cycles This event counts cycles when the Reservation Station ( RS ) is empty for the thread. The RS is a structure that buffers allocated micro-ops from the Front-end. If there are many cycles when the RS is empty, it may represent an underflow of instructions delivered from the Front-end. + 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. +-name:offcore_requests_outstanding type:exclusive default:0x1 ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd + 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. + 0x2 extra: demand_code_rd Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle + 0x4 extra: demand_rfo Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore +@@ -164,14 +164,14 @@ name:br_misp_exec type:exclusive default:0xff + 0xc1 extra: all_conditional Speculative and retired mispredicted macro conditional branches + 0xc4 extra: all_indirect_jump_non_call_ret Mispredicted indirect branches excluding calls and returns + 0xa0 extra: taken_indirect_near_call Taken speculative and retired mispredicted indirect calls +-name:idq_uops_not_delivered type:exclusive default:0x1 ++name:idq_uops_not_delivered type:exclusive default:core + 0x1 extra: core This event counts the number of undelivered (unallocated) uops from the Front-end to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. The Front-end can allocate up to 4 uops per cycle so this event can increment 0-4 times per cycle depending on the number of unallocated uops. This event is counted on a per-core basis.
+ 0x1 extra:cmask=4 cycles_0_uops_deliv_core This event counts the number of cycles during which the Front-end allocated exactly zero uops to the Resource Allocation Table (RAT) while the Back-end of the processor is not stalled. This event is counted on a per-core basis. + 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled + 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. + 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +-name:uops_executed_port type:exclusive default:0x1 ++name:uops_executed_port type:exclusive default:port_0 + 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 + 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 + 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 +@@ -236,7 +236,7 @@ name:other_assists type:exclusive default:0x8 + 0x8 extra: avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable. Errata: HSM57 + 0x10 extra: sse_to_avx Number of transitions from SSE to AVX-256 when penalty applicable. Errata: HSM57 + 0x40 extra: any_wb_assist Number of times any microcode assist is invoked by HW upon uop writeback. +-name:uops_retired type:exclusive default:0x1 ++name:uops_retired type:exclusive default:all + 0x1 extra: all Actually retired uops. + 0x1 extra: all_pebs Actually retired uops. + 0x2 extra: retire_slots This event counts the number of retirement slots used each cycle. There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle. +@@ -244,13 +244,13 @@ name:uops_retired type:exclusive default:0x1 + 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. + 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. + 0x1 extra:cmask=1,inv core_stall_cycles Cycles without actually retired uops. +-name:machine_clears type:exclusive default:0x1 ++name:machine_clears type:exclusive default:cycles + 0x1 extra: cycles Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes. + 0x2 extra: memory_ordering This event counts the number of memory ordering machine clears detected. Memory ordering machine clears can result from memory address aliasing or snoops from another hardware thread or core to data inflight in the pipeline. Machine clears can have a significant performance impact if they are happening frequently. + 0x4 extra: smc This event is incremented when self-modifying code (SMC) is detected, which causes a machine clear. Machine clears can have a significant performance impact if they are happening frequently. + 0x20 extra: maskmov This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0. + 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. +-name:br_inst_retired type:exclusive default:0x1 ++name:br_inst_retired type:exclusive default:conditional + 0x1 extra: conditional Conditional branch instructions retired. + 0x1 extra: conditional_pebs Conditional branch instructions retired. + 0x2 extra: near_call Direct and indirect near call instructions retired.
+@@ -262,7 +262,7 @@ name:br_inst_retired type:exclusive default:0x1 + 0x20 extra: near_taken_pebs Taken branch instructions retired. + 0x40 extra: far_branch Far branch instructions retired. + 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. +-name:br_misp_retired type:exclusive default:0x1 ++name:br_misp_retired type:exclusive default:conditional + 0x1 extra: conditional Mispredicted conditional branch instructions retired. + 0x1 extra: conditional_pebs Mispredicted conditional branch instructions retired. + 0x4 extra:pebs all_branches_pebs This event counts all mispredicted branch instructions retired. This is a precise event. +@@ -294,7 +294,7 @@ name:fp_assist type:exclusive default:0x1e + 0x4 extra: x87_input Number of X87 assists due to input value. + 0x8 extra: simd_output Number of SIMD FP assists due to Output values + 0x10 extra: simd_input Number of SIMD FP assists due to input values +-name:mem_uops_retired type:exclusive default:0x11 ++name:mem_uops_retired type:exclusive default:stlb_miss_loads + 0x11 extra: stlb_miss_loads Load uops with true STLB miss retired to architected path. Errata: HSM30 + 0x11 extra: stlb_miss_loads_pebs Load uops with true STLB miss retired to architected path. Errata: HSM30 + 0x12 extra: stlb_miss_stores Store uops with true STLB miss retired to architected path. Errata: HSM30 +@@ -309,7 +309,7 @@ name:mem_uops_retired type:exclusive default:0x11 + 0x81 extra: all_loads_pebs Load uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 + 0x82 extra: all_stores Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 + 0x82 extra: all_stores_pebs Store uops retired to architected path with filter on bits 0 and 1 applied. Errata: HSM30 +-name:mem_load_uops_retired type:exclusive default:0x1 ++name:mem_load_uops_retired type:exclusive default:l1_hit + 0x1 extra: l1_hit Retired load uops with L1 cache hits as data sources. Errata: HSM30 + 0x1 extra: l1_hit_pebs Retired load uops with L1 cache hits as data sources. Errata: HSM30 + 0x2 extra: l2_hit Retired load uops with L2 cache hits as data sources. Errata: HSM30 +@@ -324,7 +324,7 @@ name:mem_load_uops_retired type:exclusive default:0x1 + 0x20 extra: l3_miss_pebs Miss in last-level (L3) cache. Excludes Unknown data-source. Errata: HSM26, HSM30 + 0x40 extra: hit_lfb Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 + 0x40 extra: hit_lfb_pebs Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. Errata: HSM30 +-name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_hit_retired type:exclusive default:xsnp_miss + 0x1 extra: xsnp_miss Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 + 0x1 extra: xsnp_miss_pebs Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Errata: HSM26, HSM30 + 0x2 extra: xsnp_hit Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. Errata: HSM26, HSM30 +@@ -333,7 +333,7 @@ name:mem_load_uops_l3_hit_retired type:exclusive default:0x1 + 0x4 extra: xsnp_hitm_pebs Retired load uops which data sources were HitM responses from shared L3. 
Errata: HSM26, HSM30 + 0x8 extra: xsnp_none Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 + 0x8 extra: xsnp_none_pebs Retired load uops which data sources were hits in L3 without snoops required. Errata: HSM26, HSM30 +-name:mem_load_uops_l3_miss_retired type:exclusive default:0x1 ++name:mem_load_uops_l3_miss_retired type:exclusive default:local_dram + 0x1 extra: local_dram This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30 + 0x1 extra: local_dram_pebs This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. Errata: HSM30 + name:l2_trans type:exclusive default:0x80 diff --git a/SOURCES/oprofile-captest.patch b/SOURCES/oprofile-captest.patch new file mode 100644 index 0000000..6f16604 --- /dev/null +++ b/SOURCES/oprofile-captest.patch @@ -0,0 +1,13 @@ +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index c5b6ee7..0550fa7 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -174,6 +174,8 @@ int op_pe_utils::op_check_perf_events_cap(bool use_cpu_minus_one) + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.sample_type = PERF_SAMPLE_IP; ++ /* avoid kernel events so test works when perf_event_paranoid = 2 */ ++ attr.exclude_kernel =1; + + pid = getpid(); + syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0); diff --git a/SOURCES/oprofile-goldmont.patch b/SOURCES/oprofile-goldmont.patch new file mode 100644 index 0000000..e792b54 --- /dev/null +++ b/SOURCES/oprofile-goldmont.patch @@ -0,0 +1,492 @@ +commit 0ad5a9e6af86a88e1dd41180f45bc48b646eba6a +Author: Andi Kleen +Date: Tue Apr 26 07:52:51 2016 -0700 + + oprofile: Add support for Goldmont events + + Add support for the Intel Goldmont events. + + OFFCORE_RESPONSE.* is not supported. + + v2: Fix typos in descriptions. + v3: Add inst_retired.any_pebs + Signed-off-by: Andi Kleen + +diff --git a/events/Makefile.am b/events/Makefile.am +index 56f9020..677b05f 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -20,6 +20,7 @@ event_files = \ + i386/broadwell/events i386/broadwell/unit_masks \ + i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ ++ i386/goldmont/events i386/goldmont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ + ia64/itanium/events ia64/itanium/unit_masks \ + +diff --git a/events/i386/goldmont/events b/events/i386/goldmont/events +new file mode 100644 +index 0000000..111438e +--- /dev/null ++++ b/events/i386/goldmont/events +@@ -0,0 +1,34 @@ ++# ++# Intel "Goldmont" microarchitecture core events. ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. 
++# ++event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x03 counters:cpuid um:ld_blocks minimum:200003 name:ld_blocks : ++event:0x05 counters:cpuid um:page_walks minimum:200003 name:page_walks : ++event:0x0e counters:cpuid um:uops_issued minimum:200003 name:uops_issued_any : ++event:0x13 counters:cpuid um:misalign_mem_ref minimum:200003 name:misalign_mem_ref : ++event:0x2e counters:cpuid um:longest_lat_cache minimum:200003 name:longest_lat_cache : ++event:0x30 counters:cpuid um:l2_reject_xq minimum:200003 name:l2_reject_xq_all : ++event:0x31 counters:cpuid um:core_reject_l2q minimum:200003 name:core_reject_l2q_all : ++event:0x51 counters:cpuid um:dl1 minimum:200003 name:dl1_dirty_eviction : ++event:0x80 counters:cpuid um:icache minimum:200003 name:icache : ++event:0x81 counters:cpuid um:itlb minimum:200003 name:itlb_miss : ++event:0x86 counters:cpuid um:fetch_stall minimum:200003 name:fetch_stall_icache_fill_pending_cycles : ++event:0x9c counters:cpuid um:uops_not_delivered minimum:200003 name:uops_not_delivered_any : ++event:0xc0 counters:cpuid um:inst_retired minimum:2000003 name:inst_retired : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : ++event:0xc3 counters:cpuid um:machine_clears minimum:200003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:200003 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:200003 name:br_misp_retired : ++event:0xca counters:cpuid um:issue_slots_not_consumed minimum:200003 name:issue_slots_not_consumed : ++event:0xcb counters:cpuid um:hw_interrupts minimum:200003 name:hw_interrupts : ++event:0xcd counters:cpuid um:cycles_div_busy minimum:2000003 name:cycles_div_busy : ++event:0xd0 counters:cpuid um:mem_uops_retired minimum:200003 name:mem_uops_retired : ++event:0xd1 counters:cpuid um:mem_load_uops_retired minimum:200003 name:mem_load_uops_retired : ++event:0xe6 counters:cpuid um:baclears minimum:200003 name:baclears : ++event:0xe7 counters:cpuid um:ms_decoded minimum:200003 name:ms_decoded_ms_entry : ++event:0xe9 counters:cpuid um:decode_restriction minimum:200003 name:decode_restriction_predecode_wrong : +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +new file mode 100644 +index 0000000..2f265b3 +--- /dev/null ++++ b/events/i386/goldmont/unit_masks +@@ -0,0 +1,155 @@ ++# ++# Unit masks for the Intel "Goldmont" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Goldmont based CPUs ++# ++name:core_reject_l2q type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and L1 prefetcher requests rejected by the L2Q due to a full or nearly full condition which likely indicates back pressure from L2Q. It also counts requests that would have gone directly to the XQ, but are rejected due to a full or nearly full condition, indicating back pressure from the IDI link. The L2Q may also reject transactions from a core to insure fairness between cores, or to delay a core's dirty eviction when the address conflicts with incoming external snoops. ++name:decode_restriction type:mandatory default:0x1 ++ 0x1 extra: predecode_wrong Counts the number of times the prediction (from the predecode cache) for instruction length is incorrect. ++name:dl1 type:mandatory default:0x1 ++ 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. 
No count will occur if the evicted line is clean, and hence does not require a writeback. ++name:fetch_stall type:mandatory default:0x2 ++ 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cumulative count of cycles stalled for all icache misses. ++name:itlb type:mandatory default:0x4 ++ 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translations are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. ++name:l2_reject_xq type:mandatory default:0x0 ++ 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. ++name:ms_decoded type:mandatory default:0x1 ++ 0x1 extra: ms_entry Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++name:uops_issued type:mandatory default:0x0 ++ 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. ++name:uops_not_delivered type:mandatory default:0x0 ++ 0x0 extra: any This event is used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++name:cpu_clk_unhalted type:exclusive default:core ++ 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. ++ 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time.
This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event ++ 0x0 extra: core_p Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x1 extra: ref Reference cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter. ++name:ld_blocks type:exclusive default:all_block ++ 0x10 extra: all_block Counts anytime a load that retires is blocked for any reason. ++ 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. ++ 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). ++ 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. ++ 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. ++ 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. ++name:page_walks type:exclusive default:0x1 ++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side (walks due to a data operation) page walk is in progress. ++ 0x2 extra: i_side_cycles Counts every core cycle when an Instruction-side (walks due to an instruction fetch) page walk is in progress. ++ 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. ++name:misalign_mem_ref type:exclusive default:load_page_split ++ 0x2 extra: load_page_split Counts when a memory load of a uop spans a page boundary (a split) is retired. ++ 0x2 extra:pebs load_page_split_pebs Counts when a memory load of a uop spans a page boundary (a split) is retired. ++ 0x4 extra: store_page_split Counts when a memory store of a uop spans a page boundary (a split) is retired. ++ 0x4 extra:pebs store_page_split_pebs Counts when a memory store of a uop spans a page boundary (a split) is retired. ++name:longest_lat_cache type:exclusive default:0x4f ++ 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. ++ 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. ++name:icache type:exclusive default:0x1 ++ 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache ++ 0x2 extra: misses Counts each cache line access to the Icache that are not fulfilled (miss) by the Icache ++ 0x3 extra: accesses Counts each cache line access to the Icache ++name:inst_retired type:exclusive default:any ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0.
You cannot collect a PEBs record for this event ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. ++ 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++name:uops_retired type:exclusive default:any ++ 0x0 extra: any Counts uops which retired ++ 0x0 extra:pebs any_pebs Counts uops which retired ++ 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. ++ 0x8 extra: fpdiv Counts the number of floating point divide uops retired. ++ 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. ++ 0x10 extra: idiv Counts the number of integer divide uops retired. ++ 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. ++name:machine_clears type:exclusive default:0x0 ++ 0x0 extra: all Counts machine clears for any reason ++ 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. ++ 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. ++name:br_inst_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. ++ 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. 
++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0xf9 extra: call Counts near CALL branch instructions retired. ++ 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. ++ 0xfd extra: rel_call Counts near relative CALL branch instructions retired. ++ 0xfd extra:pebs rel_call_pebs Counts near relative CALL branch instructions retired. ++ 0xfb extra: ind_call Counts near indirect CALL branch instructions retired. ++ 0xfb extra:pebs ind_call_pebs Counts near indirect CALL branch instructions retired. ++ 0xf7 extra: return Counts near return branch instructions retired. ++ 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. ++ 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++name:br_misp_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. ++ 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. 
++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++ 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. ++name:issue_slots_not_consumed type:exclusive default:0x0 ++ 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. ++name:hw_interrupts type:exclusive default:0x1 ++ 0x1 extra: received Counts hardware interrupts received by the processor. ++ 0x4 extra: pending_and_masked Counts core cycles during which there are pending interrupts, but interrupts are masked (EFLAGS.IF = 0). ++name:cycles_div_busy type:exclusive default:0x0 ++ 0x0 extra: all Counts core cycles if either divide unit is busy. ++ 0x1 extra: idiv Counts core cycles the integer divide unit is busy. ++ 0x2 extra: fpdiv Counts core cycles the floating point divide unit is busy. ++name:mem_uops_retired type:exclusive default:all ++ 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x81 extra: all_loads Counts the number of load uops retired ++ 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. ++ 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. ++ 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. ++ 0x12 extra: dtlb_miss_stores Counts store uops retired that caused a DTLB miss. 
++ 0x12 extra:pebs dtlb_miss_stores_pebs Counts store uops retired that caused a DTLB miss. ++ 0x13 extra: dtlb_miss Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. ++ 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. ++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundry. ++name:mem_load_uops_retired type:exclusive default:l1_hit ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache ++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). 
More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++name:baclears type:exclusive default:0x1 ++ 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. ++ 0x8 extra: return Counts BACLEARS on return instructions. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. 
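The named defaults in the unit_masks file above matter because a numeric default is ambiguous whenever several entries share a value: inst_retired lists any, any_p, and their PEBS variant all at 0x0, and cpu_clk_unhalted lists both ref_tsc and ref at 0x1, so defaults such as default:any and default:core select an entry by name. A minimal C sketch of the by-name lookup, using hypothetical types and a hypothetical find_by_name() helper rather than oprofile's actual parser:

/* Minimal sketch, not oprofile's parser: um_entry and the table below
 * are hypothetical, mirroring the inst_retired entries listed above. */
#include <stdio.h>
#include <string.h>

struct um_entry {
	unsigned value;		/* numeric unit mask, e.g. 0x0 */
	const char *name;	/* named unit mask, e.g. "any" */
};

/* inst_retired as listed above: three entries, all with value 0x0 */
static const struct um_entry inst_retired[] = {
	{ 0x0, "any" },
	{ 0x0, "any_p" },
	{ 0x0, "any_p_pebs" },
};

static const struct um_entry *find_by_name(const struct um_entry *tbl,
					   size_t n, const char *name)
{
	for (size_t i = 0; i < n; i++)
		if (strcmp(tbl[i].name, name) == 0)
			return &tbl[i];
	return NULL;	/* no entry with that name */
}

int main(void)
{
	size_t n = sizeof(inst_retired) / sizeof(inst_retired[0]);
	/* default:any resolves to exactly one entry ... */
	const struct um_entry *e = find_by_name(inst_retired, n, "any");
	if (e)
		printf("default resolves to %s (0x%x)\n", e->name, e->value);
	/* ... whereas default:0x0 would match all three entries here,
	 * which is why the file names its defaults. */
	return 0;
}

The same by-name resolution is what lets a line like name:cpu_clk_unhalted type:exclusive default:core pick the 0x2 core entry unambiguously even though two other entries share the value 0x1.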
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c +index b1d5ecf..7bdde53 100644 +--- a/libop/op_cpu_type.c ++++ b/libop/op_cpu_type.c +@@ -122,6 +122,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { + { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6}, + { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6}, + { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 }, ++ { "Intel Goldmont microarchitecture", "i386/goldmont", CPU_GOLDMONT, 4 }, + }; + + static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); +@@ -739,6 +740,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) + case CPU_HASWELL: + case CPU_BROADWELL: + case CPU_SKYLAKE: ++ case CPU_GOLDMONT: + case CPU_SILVERMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: +diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h +index 9983f87..98289c5 100644 +--- a/libop/op_cpu_type.h ++++ b/libop/op_cpu_type.h +@@ -102,6 +102,7 @@ typedef enum { + CPU_ARM_V8_CA57, /* ARM Cortex-A57 */ + CPU_ARM_V8_CA53, /* ARM Cortex-A53 */ + CPU_SKYLAKE, /** < Intel Skylake microarchitecture */ ++ CPU_GOLDMONT, /** < Intel Goldmont microarchitecture */ + MAX_CPU_TYPE + } op_cpu; + +diff --git a/libop/op_events.c b/libop/op_events.c +index 25f010e..cdd0409 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -1212,6 +1212,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) + descr->name = "CPU_CLK_UNHALTED"; + break; + ++ case CPU_GOLDMONT: + case CPU_SKYLAKE: + descr->name = "cpu_clk_unhalted"; + break; +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index a6180f4..f4db8f5 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -162,6 +162,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x4d: + case 0x4c: + return CPU_SILVERMONT; ++ case 0x5c: ++ case 0x5f: ++ return CPU_GOLDMONT; + } + } + return cpu_type; +diff --git a/utils/ophelp.c b/utils/ophelp.c +index fdddddc..5821593 100644 +--- a/utils/ophelp.c ++++ b/utils/ophelp.c +@@ -544,6 +544,7 @@ int main(int argc, char const * argv[]) + case CPU_BROADWELL: + case CPU_SKYLAKE: + case CPU_SILVERMONT: ++ case CPU_GOLDMONT: + case CPU_WESTMERE: + case CPU_SANDYBRIDGE: + case CPU_IVYBRIDGE: +commit 6f2758a46554f69403e2ebc1a3e4a58350682638 +Author: Andi Kleen +Date: Fri May 6 12:11:46 2016 -0700 + + oprofile: Update Goldmont events + + This patch adds some updates to the Goldmont events. Mainly it is editorial updates + to the event descriptions. In addition it also removes the events not listed + in the SDM (which were not intended to be included) + + v2: Minor edits + Signed-off-by: Andi Kleen + +diff --git a/events/i386/goldmont/unit_masks b/events/i386/goldmont/unit_masks +index 2f265b3..d1c08d4 100644 +--- a/events/i386/goldmont/unit_masks ++++ b/events/i386/goldmont/unit_masks +@@ -10,17 +10,17 @@ name:decode_restriction type:mandatory default:0x1 + name:dl1 type:mandatory default:0x1 + 0x1 extra: dirty_eviction Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback. + name:fetch_stall type:mandatory default:0x2 +- 0x2 extra: icache_fill_pending_cycles Counts the number of cycles fetch stalls because of an icache miss. This is a cummulative count of cycles stalled for all icache misses. 
++ 0x2 extra: icache_fill_pending_cycles Counts cycles that an ICache miss is outstanding, and instruction fetch is stalled. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes, while an Icache miss is outstanding. Note this event is not the same as cycles to retrieve an instruction due to an Icache miss. Rather, it is the part of the Instruction Cache (ICache) miss time where no bytes are available for the decoder. + name:itlb type:mandatory default:0x4 + 0x4 extra: miss Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translation are filled into the ITLB. The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB. + name:l2_reject_xq type:mandatory default:0x0 + 0x0 extra: all Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims. + name:ms_decoded type:mandatory default:0x1 +- 0x1 extra: ms_entry Counts the number of times the Microcde Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. ++ 0x1 extra: ms_entry Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops. The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear. + name:uops_issued type:mandatory default:0x0 + 0x0 extra: any Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to those uops issued in the shadow of a miss-predicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear. + name:uops_not_delivered type:mandatory default:0x0 +- 0x0 extra: any This event used to measure front-end inefficiencies. I.e. when front-end of the machine is not delivering uops to the back-end and the back-end has is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. ++ 0x0 extra: any This event is used to measure front-end inefficiencies, i.e. when the front-end of the machine is not delivering uops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by back end. The back-end then takes these uops, allocates the required resources. When all resources are ready, uops are executed. If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls, eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when the back-end is requesting more uops and the front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end inefficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue. + name:cpu_clk_unhalted type:exclusive default:core + 0x2 extra: core Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBs record for this event. + 0x1 extra: ref_tsc Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBs record for this event +@@ -31,12 +31,14 @@ name:ld_blocks type:exclusive default:all_block + 0x10 extra:pebs all_block_pebs Counts anytime a load that retires is blocked for any reason. + 0x8 extra: utlb_miss Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB). + 0x8 extra:pebs utlb_miss_pebs Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB).
++ 0x2 extra: store_forward Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. ++ 0x2 extra:pebs store_forward_pebs Counts a load blocked from using a store forward because of an address/size mismatch, only one of the loads blocked from each store will be counted. + 0x1 extra: data_unknown Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x1 extra:pebs data_unknown_pebs Counts a load blocked from using a store forward, but did not occur because the store data was not available at the right time. The forward might occur subsequently when the data is available. + 0x4 extra: u4k_alias Counts loads that block because their address modulo 4K matches a pending store. + 0x4 extra:pebs u4k_alias_pebs Counts loads that block because their address modulo 4K matches a pending store. + name:page_walks type:exclusive default:0x1 +- 0x1 extra: d_side_cycles Counts every core cycle when a Data-side walks (due to data operation) page walk is in progress. ++ 0x1 extra: d_side_cycles Counts every core cycle when a Data-side (walks due to a data operation) page walk is in progress. + 0x2 extra: i_side_cycles Counts every core cycle when a Instruction-side (walks due to an instruction fetch) page walk is in progress. + 0x3 extra: cycles Counts every core cycle a page-walk is in progress due to either a data memory operation or an instruction fetch. + name:misalign_mem_ref type:exclusive default:load_page_split +@@ -48,35 +50,31 @@ name:longest_lat_cache type:exclusive default:0x4f + 0x4f extra: reference Counts memory requests originating from the core that reference a cache line in the L2 cache. + 0x41 extra: miss Counts memory requests originating from the core that miss in the L2 cache. + name:icache type:exclusive default:0x1 +- 0x1 extra: hit Counts each cache line access to the Icache that are fulfilled (hit) by the Icache +- 0x2 extra: misses Counts each cache line access to the Icache that are not fullfilled (miss) by the Icache +- 0x3 extra: accesses Counts each cache line access to the Icache ++ 0x1 extra: hit Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is in the ICache (hit). The event strives to count on a cache line basis, so that multiple accesses which hit in a single cache line count as one ICACHE.HIT. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x2 extra: misses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is not in the ICache (miss). The event strives to count on a cache line basis, so that multiple accesses which miss in a single cache line count as one ICACHE.MISS. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is not in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture. ++ 0x3 extra: accesses Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line. 
The event strives to count on a cache line basis, so that multiple fetches to a single cache line count as one ICACHE.ACCESS. Specifically, the event counts when accesses from straight line code crosses the cache line boundary, or when a branch target is to a new line. This event counts differently than Intel processors based on Silvermont microarchitecture. + name:inst_retired type:exclusive default:any +- 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event +- 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. +- 0x0 extra:pebs any_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra: any Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBs record for this event. ++ 0x0 extra: any_p Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. ++ 0x0 extra:pebs any_p_pebs Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event. 
Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time. + name:uops_retired type:exclusive default:any + 0x0 extra: any Counts uops which retired + 0x0 extra:pebs any_pebs Counts uops which retired + 0x1 extra: ms Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. + 0x1 extra:pebs ms_pebs Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist. +- 0x8 extra: fpdiv Counts the number of floating point divide uops retired. +- 0x8 extra:pebs fpdiv_pebs Counts the number of floating point divide uops retired. +- 0x10 extra: idiv Counts the number of integer divide uops retired. +- 0x10 extra:pebs idiv_pebs Counts the number of integer divide uops retired. + name:machine_clears type:exclusive default:0x0 + 0x0 extra: all Counts machine clears for any reason + 0x1 extra: smc Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors. +- 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data. ++ 0x2 extra: memory_ordering Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved - as another core is in the process of modifying the data. + 0x4 extra: fp_assist Counts machine clears due to floating point (FP) operations needing assists. For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result. + 0x8 extra: disambiguation Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address. + name:br_inst_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts branch instructions retired for all branch types. This is an architectural performance event. + 0x0 extra:pebs all_branches_pebs Counts branch instructions retired for all branch types. This is an architectural performance event. +- 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was taken and when it was not taken. +- 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. 
+- 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken. ++ 0x7e extra: jcc Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0x7e extra:pebs jcc_pebs Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken. ++ 0xfe extra: taken_jcc Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction was not taken. ++ 0xfe extra:pebs taken_jcc_pebs Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction was not taken. + 0xf9 extra: call Counts near CALL branch instructions retired. + 0xf9 extra:pebs call_pebs Counts near CALL branch instructions retired. + 0xfd extra: rel_call Counts near relative CALL branch instructions retired. +@@ -87,24 +85,24 @@ name:br_inst_retired type:exclusive default:all_branches + 0xf7 extra:pebs return_pebs Counts near return branch instructions retired. + 0xeb extra: non_return_ind Counts near indirect call or near indirect jmp branch instructions retired. + 0xeb extra:pebs non_return_ind_pebs Counts near indirect call or near indirect jmp branch instructions retired. +- 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). +- 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. Intel Architecture uses far branches to transition to a different privilege level (ex: kernel/user). ++ 0xbf extra: far_branch Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. ++ 0xbf extra:pebs far_branch_pebs Counts far branch instructions retired. This includes far jump, far call and return, and Interrupt call and return. + name:br_misp_retired type:exclusive default:all_branches + 0x0 extra: all_branches Counts mispredicted branch instructions retired including all branch types. + 0x0 extra:pebs all_branches_pebs Counts mispredicted branch instructions retired including all branch types. +- 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). +- 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Conditon is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). ++ 0x7e extra: jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition).
++ 0x7e extra:pebs jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite condition). + 0xfe extra: taken_jcc Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. + 0xfe extra:pebs taken_jcc_pebs Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken. +- 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. +- 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. +- 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. +- 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xfb extra: ind_call Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xfb extra:pebs ind_call_pebs Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted. ++ 0xf7 extra: return Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. ++ 0xf7 extra:pebs return_pebs Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted. + 0xeb extra: non_return_ind Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. + 0xeb extra:pebs non_return_ind_pebs Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted. + name:issue_slots_not_consumed type:exclusive default:0x0 + 0x0 extra: any Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY) +- 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. Including but not limited the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. ++ 0x1 extra: resource_full Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend.
Including but not limited to resources such as the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count. + 0x2 extra: recovery Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue. + name:hw_interrupts type:exclusive default:0x1 + 0x1 extra: received Counts hardware interrupts received by the processor. +@@ -117,8 +115,8 @@ name:mem_uops_retired type:exclusive default:all + 0x83 extra: all Counts the number of memory uops retired that is either a loads or a store or both. + 0x81 extra: all_loads Counts the number of load uops retired + 0x81 extra:pebs all_loads_pebs Counts the number of load uops retired +- 0x82 extra: all_stores Counts the number of store uops retired +- 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired ++ 0x82 extra: all_stores Counts the number of store uops retired. ++ 0x82 extra:pebs all_stores_pebs Counts the number of store uops retired. + 0x83 extra:pebs all_pebs Counts the number of memory uops retired that is either a loads or a store or both. + 0x11 extra: dtlb_miss_loads Counts load uops retired that caused a DTLB miss. + 0x11 extra:pebs dtlb_miss_loads_pebs Counts load uops retired that caused a DTLB miss. +@@ -128,28 +126,28 @@ name:mem_uops_retired type:exclusive default:all + 0x13 extra:pebs dtlb_miss_pebs Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss. + 0x21 extra: lock_loads Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. + 0x21 extra:pebs lock_loads_pebs Counts locked memory uops retired. This includes "regular" locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. See the SDM for a complete description of which memory load accesses are locks. +- 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundry. +- 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundry. ++ 0x41 extra: split_loads Counts load uops retired where the data requested spans a 64 byte cache line boundary. 
++ 0x41 extra:pebs split_loads_pebs Counts load uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra: split_stores Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x42 extra:pebs split_stores_pebs Counts store uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra: split Counts memory uops retired where the data requested spans a 64 byte cache line boundary. ++ 0x43 extra:pebs split_pebs Counts memory uops retired where the data requested spans a 64 byte cache line boundary. + name:mem_load_uops_retired type:exclusive default:l1_hit +- 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache +- 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache +- 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache +- 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache +- 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache +- 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache +- 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache +- 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache ++ 0x1 extra: l1_hit Counts load uops retired that hit the L1 data cache. ++ 0x1 extra:pebs l1_hit_pebs Counts load uops retired that hit the L1 data cache. ++ 0x8 extra: l1_miss Counts load uops retired that miss the L1 data cache. ++ 0x8 extra:pebs l1_miss_pebs Counts load uops retired that miss the L1 data cache. ++ 0x2 extra: l2_hit Counts load uops retired that hit in the L2 cache. ++ 0x2 extra:pebs l2_hit_pebs Counts load uops retired that hit in the L2 cache. ++ 0x10 extra: l2_miss Counts load uops retired that miss in the L2 cache. ++ 0x10 extra:pebs l2_miss_pebs Counts load uops retired that miss in the L2 cache. + 0x20 extra: hitm Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. + 0x20 extra:pebs hitm_pebs Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks. 
+- 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data , but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. +- 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. +- 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirment, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x40 extra: wcb_hit Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider loads X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains a WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x40 extra:pebs wcb_hit_pebs Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider loads X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains a WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs. ++ 0x80 extra: dram_hit Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. ++ 0x80 extra:pebs dram_hit_pebs Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response. + name:baclears type:exclusive default:0x1 + 0x1 extra: all Counts the number of times a BACLEAR is signaled for any reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns. + 0x8 extra: return Counts BACLEARS on return instructions. +- 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Conditon is Met) branches. ++ 0x10 extra: cond Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Condition is Met) branches. diff --git a/SOURCES/oprofile-order.patch b/SOURCES/oprofile-order.patch new file mode 100644 index 0000000..071590e --- /dev/null +++ b/SOURCES/oprofile-order.patch @@ -0,0 +1,59 @@ +From c95158840a7914d558a93b044c5ab0eeb0ea9337 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Tue, 9 Aug 2016 22:25:52 -0400 +Subject: [PATCH] Only start the application if the perf events setup was + successful + +The code was starting the application before the performance events +were set up. In some cases the setup of the perf events may fail +and the code needs to verify that the performance events have been +successfully set up before starting the application. Changed the +order of those steps to allow a check of the perf event setup before +launching the application. + +Signed-off-by: William Cohen +--- + pe_counting/ocount.cpp | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 4d9c104..7717717 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -257,16 +257,6 @@ bool start_counting(void) + proc_list = ocount_options::processes; + } + +- if (startApp) { +- // Tell app_PID to start the app +- cverb << vdebug << "telling child to start app" << endl; +- if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) { +- perror("Internal error on start_app_pipe"); +- return -1; +- } +- app_started = true; +- } +- + orecord = new ocount_record(runmode, events, ocount_options::display_interval ?
+ 	bool ret;
+ 	switch (runmode) {
+@@ -300,6 +290,16 @@ bool start_counting(void)
+ 		ret = false;
+ 	}
+ 
++	if (startApp && ret != false) {
++		// Tell app_PID to start the app
++		cverb << vdebug << "telling child to start app" << endl;
++		if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) {
++			perror("Internal error on start_app_pipe");
++			return false;
++		}
++		app_started = true;
++	}
++
+ 	return ret;
+ }
+ 
+-- 
+2.7.4
+
diff --git a/SOURCES/oprofile-skylake.patch b/SOURCES/oprofile-skylake.patch
index b433452..201b697 100644
--- a/SOURCES/oprofile-skylake.patch
+++ b/SOURCES/oprofile-skylake.patch
@@ -673,3 +673,48 @@ index b505769..6e81a63 100644
 -- 
 2.4.3
 
+commit 635d1f59ff198a43deb9482cdec10795222e506a
+Author: Andi Kleen
+Date: Fri Apr 15 13:14:51 2016 -0700
+
+    Add model number of Skylake server to oprofile
+
+    Just reuse the event list of Skylake client.
+
+    Signed-off-by: Andi Kleen
+
+diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
+index 994fec4..a6180f4 100644
+--- a/libop/op_hw_specific.h
++++ b/libop/op_hw_specific.h
+@@ -156,6 +156,7 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
+ 		return CPU_BROADWELL;
+ 	case 0x4e:
+ 	case 0x5e:
++	case 0x55:
+ 		return CPU_SKYLAKE;
+ 	case 0x37:
+ 	case 0x4d:
+commit 402cad1b6f5605ed854eb8b7b7376cafce3fb007
+Author: Andi Kleen
+Date: Fri Apr 29 17:50:25 2016 -0700
+
+    oprofile: Add model numbers for Kabylake CPUs
+
+    The PMU is using the same events as Skylake, so no other changes.
+
+    Signed-off-by: Andi Kleen
+
+diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
+index f4db8f5..2061760 100644
+--- a/libop/op_hw_specific.h
++++ b/libop/op_hw_specific.h
+@@ -157,6 +157,8 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
+ 	case 0x4e:
+ 	case 0x5e:
+ 	case 0x55:
++	case 0x8e:
++	case 0x9e:
+ 		return CPU_SKYLAKE;
+ 	case 0x37:
+ 	case 0x4d:
diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec
index f90d023..7d7cf85 100644
--- a/SPECS/oprofile.spec
+++ b/SPECS/oprofile.spec
@@ -1,7 +1,7 @@
 Summary: System wide profiler
 Name: oprofile
 Version: 0.9.9
-Release: 16%{?dist}
+Release: 20%{?dist}
 License: GPLv2+ and LGPLv2+
 Group: Development/System
 #
@@ -33,6 +33,11 @@ Patch900: oprofile-ppc64jvm.patch
 Patch1000: oprofile-skylake.patch
 Patch1001: oprofile-remap.patch
 Patch1002: oprofile-xml2.patch
+Patch1003: oprofile-goldmont.patch
+Patch1004: oprofile-bz1335145.patch
+Patch1005: oprofile-bz1264443.patch
+Patch1006: oprofile-captest.patch
+Patch1007: oprofile-order.patch
 
 URL: http://oprofile.sf.net
 
@@ -118,6 +123,11 @@ agent library.
 %patch1000 -p1
 %patch1001 -p1
 %patch1002 -p1
+%patch1003 -p1
+%patch1004 -p1
+%patch1005 -p1 -b .archive
+%patch1006 -p1 -b .captest
+%patch1007 -p1 -b .order
 
 ./autogen.sh
 
@@ -204,6 +214,23 @@ exit 0
 %{_sysconfdir}/ld.so.conf.d/*
 
 %changelog
+* Tue Aug 9 2016 William Cohen - 0.9.9-20
+- Ensure that the perf events are set up before ocount execs the child.
+
+* Mon Aug 8 2016 William Cohen - 0.9.9-19
+- Allow operation with /proc/sys/kernel/perf_event_paranoid == 2.
+
+* Wed Jul 6 2016 William Cohen - 0.9.9-18
+- Store profiling data with oparchive.
+
+* Thu May 12 2016 William Cohen - 0.9.9-17
+- Define some Intel Broadwell default unit masks by name
+- Add support for Harrisonville (Denverton SoC)
+- Add support for Skylake-SP server
+- Add support for Kabylake-U/Y
+- Add support for Kabylake-H/S
+- Make Nehalem, Westmere, and Haswell event names unique.
+
 * Tue Aug 25 2015 William Cohen - 0.9.9-16
 - Improved handling of remapped anonymous regions
 - Correct XML generation.