diff --git a/SOURCES/oprofile-coverity.patch b/SOURCES/oprofile-coverity.patch new file mode 100644 index 0000000..bdba88d --- /dev/null +++ b/SOURCES/oprofile-coverity.patch @@ -0,0 +1,320 @@ +commit be6d22999668ac976acd2008ec13db4385a0c8dd +Author: Maynard Johnson +Date: Mon Jan 27 15:44:18 2014 -0600 + + Fix issues detected by Coverity + + Will Cohen ran Coverity against oprofile and reported some issues + on Nov 20, 2013. I submitted the current oprofile source to the + Coverity webpage, and a couple new issues were detected. This + patch addresses most of these issues. Some issues are either + false positives from Coverity's analysis or have been marked + as "Intentional" so as to have Coverity ignore them. + + Signed-off-by: Maynard Johnson + +diff --git a/daemon/init.c b/daemon/init.c +index 2882c49..1fed812 100644 +--- a/daemon/init.c ++++ b/daemon/init.c +@@ -154,7 +154,7 @@ static void opd_do_jitdumps(void) + struct timeval tv; + char end_time_str[32]; + char opjitconv_path[PATH_MAX + 1]; +- char * exec_args[7]; ++ char * exec_args[8]; + + if (jit_conversion_running) + return; +@@ -175,6 +175,7 @@ static void opd_do_jitdumps(void) + if (vmisc) + exec_args[arg_num++] = "-d"; + exec_args[arg_num++] = "--delete-jitdumps"; ++ exec_args[arg_num++] = "--session-dir"; + exec_args[arg_num++] = session_dir; + exec_args[arg_num++] = start_time_str; + exec_args[arg_num++] = end_time_str; +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index aa0c1c5..0b7482f 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -487,7 +487,7 @@ handle_named_um: + (endptr <= (mask + strlen(mask) - 2))) { // '- 2' to account for linefeed and '\0' + + // Must be a default named unit mask +- strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); ++ strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN - 1); + goto handle_named_um; + } + config |= ((event->evt_um & 0xFFULL) << 8); +diff --git a/libutil++/op_bfd.h b/libutil++/op_bfd.h +index 6ce71fa..1aa7e10 100644 +--- a/libutil++/op_bfd.h ++++ b/libutil++/op_bfd.h +@@ -334,8 +334,8 @@ private: + bfd_vma vma_adj; + + /** +- * The file descriptor for an image file that we pass to fdopen_bfd must be kep +- * open through the life of the op_bfd to enable proper beahvior of certain ++ * The file descriptor for an image file that we pass to fdopen_bfd must be kept ++ * open through the life of the op_bfd to enable proper behavior of certain + * BFD functions -- in particular, bfd_find_nearest_line(). 
+ */ + int fd; +diff --git a/libutil++/op_spu_bfd.cpp b/libutil++/op_spu_bfd.cpp +index 4ac5245..29d6e06 100644 +--- a/libutil++/op_spu_bfd.cpp ++++ b/libutil++/op_spu_bfd.cpp +@@ -50,7 +50,7 @@ op_bfd::op_bfd(uint64_t spu_offset, string const & fname, + anon_obj(false), + vma_adj(0) + { +- int fd = -1; ++ fd = -1; + struct stat st; + int notes_remaining; + bool spu_note_found = false; +diff --git a/opjitconv/conversion.c b/opjitconv/conversion.c +index 111fe9d..add0f95 100644 +--- a/opjitconv/conversion.c ++++ b/opjitconv/conversion.c +@@ -39,10 +39,10 @@ static void free_jit_debug_line(void) + jitentry_debug_line_list = NULL; + } + +-int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, ++int op_jit_convert(struct op_jitdump_info * file_info, char const * elffile, + unsigned long long start_time, unsigned long long end_time) + { +- void const * jitdump = file_info.dmp_file; ++ void const * jitdump = file_info->dmp_file; + int rc= OP_JIT_CONV_OK; + + entry_count = 0; +@@ -53,7 +53,7 @@ int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, + jitentry_debug_line_list = NULL; + entries_symbols_ascending = entries_address_ascending = NULL; + +- if ((rc = parse_all(jitdump, jitdump + file_info.dmp_file_stat.st_size, ++ if ((rc = parse_all(jitdump, jitdump + file_info->dmp_file_stat.st_size, + end_time)) == OP_JIT_CONV_FAIL) + goto out; + +diff --git a/opjitconv/opjitconv.c b/opjitconv/opjitconv.c +index a9dfa91..9d910be 100644 +--- a/opjitconv/opjitconv.c ++++ b/opjitconv/opjitconv.c +@@ -19,6 +19,7 @@ + #include "op_file.h" + #include "op_libiberty.h" + ++#include + #include + #include + #include +@@ -75,6 +76,19 @@ int debug; + int non_root; + /* indicates we should delete jitdump files owned by the user */ + int delete_jitdumps; ++/* Session directory where sample data is stored */ ++char * session_dir; ++ ++static struct option long_options [] = { ++ { "session-dir", required_argument, NULL, 's'}, ++ { "debug", no_argument, NULL, 'd'}, ++ { "delete-jitdumps", no_argument, NULL, 'j'}, ++ { "non-root", no_argument, NULL, 'n'}, ++ { "help", no_argument, NULL, 'h'}, ++ { NULL, 9, NULL, 0} ++}; ++const char * short_options = "s:djnh"; ++ + LIST_HEAD(jitdump_deletion_candidates); + + /* +@@ -407,7 +421,7 @@ chk_proc_id: + goto free_res3; + } + /* Convert the dump file as the special user 'oprofile'. 
*/ +- rc = op_jit_convert(dmp_info, tmp_elffile, start_time, end_time); ++ rc = op_jit_convert(&dmp_info, tmp_elffile, start_time, end_time); + if (rc < 0) + goto free_res3; + +@@ -772,61 +786,99 @@ static void _cleanup_jitdumps(void) + + } + +-int main(int argc, char ** argv) ++static void __print_usage(const char * extra_msg) ++{ ++ if (extra_msg) ++ fprintf(stderr, extra_msg); ++ fprintf(stderr, "usage: opjitconv [--debug | --non-root | --delete-jitdumps ] --session-dir= \n"); ++} ++ ++static int _process_args(int argc, char * const argv[]) ++{ ++ int keep_trying = 1; ++ int idx_of_non_options = 0; ++ setenv("POSIXLY_CORRECT", "1", 0); ++ while (keep_trying) { ++ int option_idx = 0; ++ int c = getopt_long(argc, argv, short_options, long_options, &option_idx); ++ switch (c) { ++ case -1: ++ if (optind != argc) { ++ idx_of_non_options = optind; ++ } ++ keep_trying = 0; ++ break; ++ case '?': ++ printf("non-option detected at optind %d\n", optind); ++ keep_trying = 0; ++ idx_of_non_options = -1; ++ break; ++ case 's': ++ session_dir = optarg; ++ break; ++ case 'd': ++ debug = 1; ++ break; ++ case 'n': ++ non_root = 1; ++ break; ++ case 'j': ++ delete_jitdumps = 1; ++ break; ++ case 'h': ++ break; ++ default: ++ break; ++ } ++ } ++ return idx_of_non_options; ++} ++ ++int main(int argc, char * const argv[]) + { + unsigned long long start_time, end_time; +- char session_dir[PATH_MAX]; +- int rc = 0; ++ struct stat filestat; ++ int non_options_idx, rc = 0; + size_t sessdir_len = 0; +- char * path_end; + + debug = 0; +- if (argc > 1 && strcmp(argv[1], "-d") == 0) { +- debug = 1; +- argc--; +- argv++; +- } + non_root = 0; +- if (argc > 1 && strcmp(argv[1], "--non-root") == 0) { +- non_root = 1; +- argc--; +- argv++; +- } +- + delete_jitdumps = 0; +- if (argc > 1 && strcmp(argv[1], "--delete-jitdumps") == 0) { +- delete_jitdumps = 1; +- argc--; +- argv++; +- } +- +- if (argc != 4) { +- printf("Usage: opjitconv [-d] " +- " \n"); ++ session_dir = NULL; ++ non_options_idx = _process_args(argc, argv); ++ // We need the session_dir and two non-option values passed -- starttime and endtime. ++ if (!session_dir || (non_options_idx != argc - 2)) { ++ __print_usage(NULL); + fflush(stdout); + rc = EXIT_FAILURE; + goto out; + } + + /* +- * Check for a maximum of 4096 bytes (Linux path name length limit) decremented +- * by 16 bytes (will be used later for appending samples sub directory). ++ * Check for a maximum of 4096 bytes (Linux path name length limit) minus 16 bytes ++ * (to be used later for appending samples sub directory) minus 1 (for terminator). + * Integer overflows according to the session dir parameter (user controlled) + * are not possible anymore. 
+ */ +- path_end = memchr(argv[1], '\0', PATH_MAX); +- if (!path_end || ((sessdir_len = (path_end - argv[1])) >= PATH_MAX - 16)) { ++ if ((sessdir_len = strlen(session_dir)) >= (PATH_MAX - 17)) { + printf("opjitconv: Path name length limit exceeded for session directory\n"); + rc = EXIT_FAILURE; + goto out; + } +- memset(session_dir, '\0', PATH_MAX); +- assert(sessdir_len < (PATH_MAX - 16 - 1)); +- strncpy(session_dir, argv[1], sessdir_len); +- session_dir[PATH_MAX -1] = '\0'; + +- start_time = atol(argv[2]); +- end_time = atol(argv[3]); ++ if (stat(session_dir, &filestat)) { ++ perror("stat operation on passed session-dir failed"); ++ rc = EXIT_FAILURE; ++ goto out; ++ } ++ if (!S_ISDIR(filestat.st_mode)) { ++ printf("Passed session-dir %s is not a directory\n", session_dir); ++ rc = EXIT_FAILURE; ++ goto out; ++ } ++ ++ start_time = atol(argv[non_options_idx++]); ++ end_time = atol(argv[non_options_idx]); + + if (start_time > end_time) { + rc = EXIT_FAILURE; +diff --git a/opjitconv/opjitconv.h b/opjitconv/opjitconv.h +index f6243c9..a3ce37f 100644 +--- a/opjitconv/opjitconv.h ++++ b/opjitconv/opjitconv.h +@@ -99,7 +99,7 @@ int parse_all(void const * start, void const * end, + unsigned long long end_time); + + /* conversion.c */ +-int op_jit_convert(struct op_jitdump_info file_info, char const * elffile, ++int op_jit_convert(struct op_jitdump_info *file_info, char const * elffile, + unsigned long long start_time, unsigned long long end_time); + + /* create_bfd.c */ +diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp +index 88aed3d..399308f 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -787,7 +787,7 @@ static void _do_jitdump_convert() + struct timeval tv; + char end_time_str[32]; + char opjitconv_path[PATH_MAX + 1]; +- char * exec_args[8]; ++ char * exec_args[9]; + + jitconv_pid = fork(); + switch (jitconv_pid) { +@@ -799,6 +799,7 @@ static void _do_jitdump_convert() + const char * debug_option = "-d"; + const char * non_root_user = "--non-root"; + const char * delete_jitdumps = "--delete-jitdumps"; ++ const char * sess_dir = "--session-dir"; + gettimeofday(&tv, NULL); + end_time = tv.tv_sec; + sprintf(end_time_str, "%llu", end_time); +@@ -810,6 +811,7 @@ static void _do_jitdump_convert() + if (my_uid != 0) + exec_args[arg_num++] = (char *)non_root_user; + exec_args[arg_num++] = (char *)delete_jitdumps; ++ exec_args[arg_num++] = (char *)sess_dir; + exec_args[arg_num++] = (char *)operf_options::session_dir.c_str(); + exec_args[arg_num++] = start_time_str; + exec_args[arg_num++] = end_time_str; diff --git a/SOURCES/oprofile-defaultmask.patch b/SOURCES/oprofile-defaultmask.patch new file mode 100644 index 0000000..ecd71ac --- /dev/null +++ b/SOURCES/oprofile-defaultmask.patch @@ -0,0 +1,33 @@ +commit fb9529161039e96d44b4b7396450cff04e3d9aa8 +Author: Maynard Johnson +Date: Tue Oct 15 14:58:16 2013 -0500 + + Fix operf/ocount default unit mask selection + + Many events (particularly in the x86* architectures) + require a unit mask value to specify the exact event + type. For such events, a default unit mask value + is assigned. When a user runs operf, ocount, or + opcontrol and specifies such an event but does not + specify a unit mask, the default unit mask should be + selected and used by the tool. A bug was discovered + with operf and ocount where the unit mask value in + this situation was being set to '0' instead of the + default unit mask value. This patch fixes the bug. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index b85d175..177835e 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -484,7 +484,8 @@ handle_named_um: + pclose(fp); + event->evt_um = strtoull(mask, &endptr, 10); + if ((endptr >= mask) && +- (endptr <= (mask + strlen(mask) - 1))) { ++ (endptr <= (mask + strlen(mask) - 2))) { // '- 2' to account for linefeed and '\0' ++ + // Must be a default named unit mask + strncpy(event->um_name, mask, OP_MAX_UM_NAME_LEN); + goto handle_named_um; diff --git a/SOURCES/oprofile-env.patch b/SOURCES/oprofile-env.patch new file mode 100644 index 0000000..1b81a13 --- /dev/null +++ b/SOURCES/oprofile-env.patch @@ -0,0 +1,71 @@ +From b869a61861e161c855379c4b5700fd352da01154 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Thu, 8 Jan 2015 16:37:57 -0500 +Subject: [PATCH] Avoid permanently setting POSIXLY_CORRECT environment + variable + +During testing on Fedora it was discovered that operf was setting the +enviroment variable POSIXLY_CORRECT and this could potentially be +observed in the children tasks that operf starts (Red Hat Bugzilla +1178577). The operf, ocount, and opjitconv commands all ensure that +POSIXLY_CORRECT environment variable is set when the options are +processed with getopt_long, but they never unset the variable +afterwards. This patch ensures that POSIXLY_CORRECT is as it was +before it was set. + +Signed-off-by: William Cohen +--- + opjitconv/opjitconv.c | 5 +++++ + pe_counting/ocount.cpp | 5 +++++ + pe_profiling/operf.cpp | 5 +++++ + 3 files changed, 15 insertions(+) + +diff --git a/pe_counting/ocount.cpp b/pe_counting/ocount.cpp +index 07dfd0c..f7caede 100644 +--- a/pe_counting/ocount.cpp ++++ b/pe_counting/ocount.cpp +@@ -579,6 +579,7 @@ static int _process_ocount_and_app_args(int argc, char * const argv[]) + { + bool keep_trying = true; + int idx_of_non_options = 0; ++ char * prev_env = getenv("POSIXLY_CORRECT"); + setenv("POSIXLY_CORRECT", "1", 0); + while (keep_trying) { + int option_idx = 0; +@@ -663,6 +664,10 @@ static int _process_ocount_and_app_args(int argc, char * const argv[]) + __print_usage_and_exit("ocount: unexpected end of arg parsing"); + } + } ++ ++ if (prev_env == NULL) ++ unsetenv("POSIXLY_CORRECT"); ++ + return idx_of_non_options; + } + +diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp +index 04a25d9..a186278 100644 +--- a/pe_profiling/operf.cpp ++++ b/pe_profiling/operf.cpp +@@ -1258,6 +1258,7 @@ static int _process_operf_and_app_args(int argc, char * const argv[]) + { + bool keep_trying = true; + int idx_of_non_options = 0; ++ char * prev_env = getenv("POSIXLY_CORRECT"); + setenv("POSIXLY_CORRECT", "1", 0); + while (keep_trying) { + int option_idx = 0; +@@ -1331,6 +1332,10 @@ static int _process_operf_and_app_args(int argc, char * const argv[]) + __print_usage_and_exit("unexpected end of arg parsing"); + } + } ++ ++ if (prev_env == NULL) ++ unsetenv("POSIXLY_CORRECT"); ++ + return idx_of_non_options; + } + +-- +2.1.0 + diff --git a/SOURCES/oprofile-extramask.patch b/SOURCES/oprofile-extramask.patch new file mode 100644 index 0000000..6863288 --- /dev/null +++ b/SOURCES/oprofile-extramask.patch @@ -0,0 +1,97 @@ +From dd433306f249db81f1ef5cfffefeb2d0ad4e3115 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Tue, 10 Mar 2015 10:52:39 -0400 +Subject: [PATCH] Ensure that umask is set if the extra bits (edge, inv, cmask) + are used + +When testing ocount on some of the Intel processor it was discovered +that 
that the umask not not being set for events that specified the +the extra bits. Below is an example of the problem on an Intel Ivy +Bridge processor with the event code missing the 0x03 unit masks for +the events: + +$ ocount --verbose -e int_misc:recovery_cycles -e int_misc:recovery_stalls_count ls +Final event code is 140000d +Final event code is 144000d +Number of events passed is 2 +Exec args are: ls +telling child to start app +parent says start app /usr/bin/ls +calling perf_event_open for pid 240d +perf_event_open returning fd 9 +perf_event_open returning fd a +perf counter setup complete +app 240d is running +going into waitpid on monitored app 240d +app process ended normally. +Reading counter data for event int_misc +Reading counter data for event int_misc + +Events were actively counted for 1070382 nanoseconds. +Event counts (actual) for /usr/bin/ls: + Event Count % time counted + int_misc:recovery_cycles 0 100.00 + int_misc:recovery_stalls_count 0 100.00 + +With this patch the umasks are included and the example executes correctly: + +$ ocount --verbose -e int_misc:recovery_cycles -e int_misc:recovery_stalls_count ls +Final event code is 140030d +Final event code is 144030d +Number of events passed is 2 +Exec args are: ls +telling child to start app +calling perf_event_open for pid 72e1 +parent says start app /usr/bin/ls +perf_event_open returning fd 9 +perf_event_open returning fd a +perf counter setup complete +app 72e1 is running +going into waitpid on monitored app 72e1 +app process ended normally. +Reading counter data for event int_misc +Reading counter data for event int_misc + +Events were actively counted for 1216948 nanoseconds. +Event counts (actual) for /usr/bin/ls: + Event Count % time counted + int_misc:recovery_cycles 69,730 100.00 + int_misc:recovery_stalls_count 14,800 100.00 + +Signed-off-by: William Cohen +--- + libop/op_events.c | 3 +++ + libop/op_events.h | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/libop/op_events.c b/libop/op_events.c +index 99266c6..2badc8e 100644 +--- a/libop/op_events.c ++++ b/libop/op_events.c +@@ -238,6 +238,9 @@ static void parse_um_entry(struct op_described_um * entry, char const * line) + if (strisprefix(c, "extra:")) { + c += 6; + entry->extra = parse_extra(c); ++ /* include the regular umask if there are real extra bits */ ++ if (entry->extra != EXTRA_NONE) ++ entry->extra |= (entry->value & UMASK_MASK) << UMASK_SHIFT; + /* named mask */ + c = skip_nonws(c); + c = skip_ws(c); +diff --git a/libop/op_events.h b/libop/op_events.h +index ec345e5..f09c830 100644 +--- a/libop/op_events.h ++++ b/libop/op_events.h +@@ -20,6 +20,9 @@ extern "C" { + #include "op_types.h" + #include "op_list.h" + ++#define UMASK_SHIFT 8 ++#define UMASK_MASK 0xff ++ + #define EXTRA_EDGE (1U << 18) + #define EXTRA_MIN_VAL EXTRA_EDGE + +-- +2.1.0 + diff --git a/SOURCES/oprofile-hugepage.patch b/SOURCES/oprofile-hugepage.patch new file mode 100644 index 0000000..111ba1d --- /dev/null +++ b/SOURCES/oprofile-hugepage.patch @@ -0,0 +1,34 @@ +From 0246c6ba4a08378c46c17617d831d6baf0f44989 Mon Sep 17 00:00:00 2001 +From: William Cohen +Date: Fri, 9 Jan 2015 16:44:09 -0500 +Subject: [PATCH] Allow operf to track anon_hugepage mmap entries + +The perf mmap information for anon_huge pages has a different filename +("/anon_hugepage") than the mmap information for regions composed of +normal sized pages ("//anon"). 
This results in opreport not being +able to map samples collected by operf to Java methods when the Java +VM uses statically allocated huge pages (rhbz1180512 and rhbz1180513). + +Signed-off-by: William Cohen +--- + libperf_events/operf_utils.cpp | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index a87524b..90a0765 100644 +--- a/libperf_events/operf_utils.cpp ++++ b/libperf_events/operf_utils.cpp +@@ -295,6 +295,10 @@ static void __handle_mmap_event(event_t * event) + strlen("//anon")) == 0)) { + mapping->is_anon_mapping = true; + strcpy(mapping->filename, "anon"); ++ } else if ((strncmp(mapping->filename, "/anon_hugepage", ++ strlen("/anon_hugepage")) == 0)) { ++ mapping->is_anon_mapping = true; ++ strcpy(mapping->filename, "anon"); + } + mapping->end_addr = (event->mmap.len == 0ULL)? 0ULL : mapping->start_addr + event->mmap.len - 1; + mapping->pgoff = event->mmap.pgoff; +-- +2.1.0 + diff --git a/SOURCES/oprofile-intelcpuid.patch b/SOURCES/oprofile-intelcpuid.patch new file mode 100644 index 0000000..3bb0201 --- /dev/null +++ b/SOURCES/oprofile-intelcpuid.patch @@ -0,0 +1,27 @@ +commit a154a6ba3477c9cb51e2c225e6434909bb41a60a +Author: Andi Kleen +Date: Thu Jun 11 13:54:59 2015 -0700 + + oprofile: Add Intel Airmont and Intel Xeon D model numbers + + Add a model number for Airmont/Braswell CPUs + Add a model number for Broadwell based Xeon D CPUs. + + Signed-off-by: Andi Kleen + +diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h +index 1d39692..8a7ed1c 100644 +--- a/libop/op_hw_specific.h ++++ b/libop/op_hw_specific.h +@@ -152,9 +152,11 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) + case 0x3d: + case 0x47: + case 0x4f: ++ case 0x56: + return CPU_BROADWELL; + case 0x37: + case 0x4d: ++ case 0x4c: + return CPU_SILVERMONT; + } + } diff --git a/SOURCES/oprofile-maskarray.patch b/SOURCES/oprofile-maskarray.patch new file mode 100644 index 0000000..2851b63 --- /dev/null +++ b/SOURCES/oprofile-maskarray.patch @@ -0,0 +1,33 @@ +commit ef501aa609f49c06df9f33c9a7330dffd71b31b3 +Author: Maynard Johnson +Date: Thu Oct 31 11:11:06 2013 -0500 + + Fix handling of default named unit masks longer than 11 chars + + The handling of default unit masks that are names instead of hex + values is new with oprofile 0.9.9. I've discovered a bug in this + handling when the name exceeds 11 characters. For example, on + Sandybridge, the following ocount command fails: + + [mpjohn@oc1757000783 test-stuff]$ ocount -e l1d_blocks ls + Cannot find unit mask bank_confli for l1d_blocks + Unable to find unit mask info for bank_confli for event l1d_blocks + + This problem was due to the char array ('mask') being too small. 
+ + Signed-off-by: Maynard Johnson + +diff --git a/libpe_utils/op_pe_utils.cpp b/libpe_utils/op_pe_utils.cpp +index 177835e..9e2addb 100644 +--- a/libpe_utils/op_pe_utils.cpp ++++ b/libpe_utils/op_pe_utils.cpp +@@ -413,8 +413,8 @@ static void _get_event_code(operf_event_t * event, op_cpu cpu_type) + + + #if defined(__i386__) || defined(__x86_64__) ++ char mask[OP_MAX_UM_NAME_LEN]; + // Setup EventSelct[11:8] field for AMD +- char mask[12]; + const char * vendor_AMD = "AuthenticAMD"; + if (op_is_cpu_vendor((char *)vendor_AMD)) { + config = base_code & 0xF00ULL; diff --git a/SOURCES/oprofile-ppc64jvm.patch b/SOURCES/oprofile-ppc64jvm.patch new file mode 100644 index 0000000..e870335 --- /dev/null +++ b/SOURCES/oprofile-ppc64jvm.patch @@ -0,0 +1,63 @@ +commit a41b4231ccfc83fb99271507a8e98f84a348e71d +Author: Rei Odaira +Date: Fri May 22 15:34:50 2015 -0400 + + Filter out zero-sized mapping to avoid opjitconv running indefinitely + + I found opjitconv ran indefinitely when profiling a Java application + running on OpenJDK/ppc64le. This is because OpenJDK sometimes reports + generation of zero-size jitted code via JVMTI, but scan_overlaps() in + opjitconv does not assume the existence of jitted code with size zero. + + (1) scan_overlaps() finds overlap between a normal jitted code and a + zero-size jitted code. + (2) eliminate_overlaps() tries to split the zero-size jitted code but + cannot. + (3) resolve_overlaps() incorrectly thinks the split has happened and + invokes scan_overlaps() again. + (4) Back to (1) + + One solution is to remove all the zero-size entries before resolving + overlaps which is implemented by this patch. + + Signed-off-by: William Cohen + +diff --git a/opjitconv/jitsymbol.c b/opjitconv/jitsymbol.c +index e2b1e66..1b980af 100644 +--- a/opjitconv/jitsymbol.c ++++ b/opjitconv/jitsymbol.c +@@ -201,6 +201,26 @@ static void invalidate_earlybirds(unsigned long long start_time) + } + } + ++static void invalidate_zero_size_entries(void) ++{ ++ u32 i; ++ int flag; ++ struct jitentry * a; ++ ++ flag = 0; ++ for (i = 0; i < entry_count; i++) { ++ a = entries_address_ascending[i]; ++ if (a->code_size == 0) { ++ invalidate_entry(a); ++ flag = 1; ++ } ++ } ++ if (flag) { ++ resort_address(); ++ resort_symbol(); ++ } ++} ++ + + /* select the symbol with the longest life time in the index range */ + static int select_one(int start_idx, int end_idx) +@@ -505,6 +525,7 @@ int resolve_overlaps(unsigned long long start_time) + int cnt = 0; + + invalidate_earlybirds(start_time); ++ invalidate_zero_size_entries(); + while ((rc = scan_overlaps()) && rc != OP_JIT_CONV_FAIL) { + resort_address(); + if (cnt == 0) { diff --git a/SOURCES/oprofile-remap.patch b/SOURCES/oprofile-remap.patch new file mode 100644 index 0000000..1252d79 --- /dev/null +++ b/SOURCES/oprofile-remap.patch @@ -0,0 +1,60 @@ +commit 1c54c9a3d96dd8d9d1d579baaeabc94d0f923ee8 +Author: William Cohen +Date: Fri Jul 10 15:41:33 2015 -0400 + + Improve handling of remapped anon regions across processes + + Java runtime environments use dynamically allocated memory in + anonymous regions to store Just-In-Time translated code. The Java + runtime system may change the access permissions for portions of mmap + regions during execution and operf needs to be tolerant of those + change to a portion of the mmap. operf also needs to keep the anon + memory maps distinct between processes to avoid confusion about the + sizes of the memory regions. 
+ + Signed-off-by: William Cohen + +diff --git a/libperf_events/operf_process_info.h b/libperf_events/operf_process_info.h +index f98591f..3138ffb 100644 +--- a/libperf_events/operf_process_info.h ++++ b/libperf_events/operf_process_info.h +@@ -25,6 +25,7 @@ struct operf_mmap { + u64 start_addr; + u64 end_addr; + u64 pgoff; ++ u32 pid; + bool is_anon_mapping; + bool is_hypervisor; + char filename[PATH_MAX]; +diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp +index 90a0765..ff972d4 100644 +--- a/libperf_events/operf_utils.cpp ++++ b/libperf_events/operf_utils.cpp +@@ -275,7 +275,10 @@ static void __handle_mmap_event(event_t * event) + range = all_images_map.equal_range(image_basename); + for (it = range.first; it != range.second; it++) { + if (((strcmp((*it).second->filename, image_basename.c_str())) == 0) +- && ((*it).second->start_addr == event->mmap.start)) { ++ && ((*it).second->pid == 0 || (*it).second->pid == event->mmap.pid) ++ && ((*it).second->start_addr <= event->mmap.start ++ && ((*it).second->end_addr >= event->mmap.start + event->mmap.len))) ++ { + mapping = (*it).second; + break; + } +@@ -291,12 +294,15 @@ static void __handle_mmap_event(event_t * event) + */ + if (mapping->filename[0] == '[') { + mapping->is_anon_mapping = true; ++ mapping->pid = event->mmap.pid; + } else if ((strncmp(mapping->filename, "//anon", + strlen("//anon")) == 0)) { + mapping->is_anon_mapping = true; ++ mapping->pid = event->mmap.pid; + strcpy(mapping->filename, "anon"); + } else if ((strncmp(mapping->filename, "/anon_hugepage", + strlen("/anon_hugepage")) == 0)) { ++ mapping->pid = event->mmap.pid; + mapping->is_anon_mapping = true; + strcpy(mapping->filename, "anon"); + } diff --git a/SOURCES/oprofile-skylake.patch b/SOURCES/oprofile-skylake.patch new file mode 100644 index 0000000..b433452 --- /dev/null +++ b/SOURCES/oprofile-skylake.patch @@ -0,0 +1,675 @@ +From 917dfab881becfad104ad02682a88afb54284932 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Wed, 1 Jul 2015 14:36:42 -0700 +Subject: [PATCH 1/3] Add support for Intel Skylake events + +Add support for the Intel Skylake micro architecture to oprofile. + +OFFCORE_* and FRONTEND_* events are not supported for now because +oprofile does not support setting up config1 + +Signed-off-by: Andi Kleen +--- + events/Makefile.am | 1 + + events/i386/skylake/events | 62 ++++++++ + events/i386/skylake/unit_masks | 314 +++++++++++++++++++++++++++++++++++++++++ + libop/op_cpu_type.c | 2 + + libop/op_cpu_type.h | 1 + + libop/op_events.c | 1 + + libop/op_hw_specific.h | 3 + + utils/ophelp.c | 1 + + 8 files changed, 385 insertions(+) + create mode 100644 events/i386/skylake/events + create mode 100644 events/i386/skylake/unit_masks + +diff --git a/events/Makefile.am b/events/Makefile.am +index d68f0e8..56f9020 100644 +--- a/events/Makefile.am ++++ b/events/Makefile.am +@@ -18,6 +18,7 @@ event_files = \ + i386/ivybridge/events i386/ivybridge/unit_masks \ + i386/haswell/events i386/haswell/unit_masks \ + i386/broadwell/events i386/broadwell/unit_masks \ ++ i386/skylake/events i386/skylake/unit_masks \ + i386/silvermont/events i386/silvermont/unit_masks \ + ia64/ia64/events ia64/ia64/unit_masks \ + ia64/itanium2/events ia64/itanium2/unit_masks \ +diff --git a/events/i386/skylake/events b/events/i386/skylake/events +new file mode 100644 +index 0000000..28d6654 +--- /dev/null ++++ b/events/i386/skylake/events +@@ -0,0 +1,62 @@ ++# ++# Intel "Skylake" microarchitecture core events. 
++# ++# See http://ark.intel.com/ for help in identifying Skylake based CPUs ++# ++# Note the minimum counts are not discovered experimentally and could be likely ++# lowered in many cases without ill effect. ++# ++event:0x00 counters:1 um:inst_retired minimum:2000003 name:inst_retired : ++event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted : ++event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks : ++event:0x07 counters:cpuid um:ld_blocks_partial minimum:100003 name:ld_blocks_partial_address_alias : ++event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses : ++event:0x0d counters:cpuid um:int_misc minimum:2000003 name:int_misc : ++event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued : ++event:0x14 counters:cpuid um:arith minimum:2000003 name:arith_divider_active : ++event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts : ++event:0x2e counters:cpuid um:longest_lat_cache minimum:100003 name:longest_lat_cache : ++event:0x3c counters:cpuid um:cpu_clk_thread_unhalted minimum:2000003 name:cpu_clk_thread_unhalted : ++event:0x48 counters:cpuid um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss : ++event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000003 name:dtlb_store_misses : ++event:0x4c counters:cpuid um:load_hit_pre minimum:100003 name:load_hit_pre_sw_pf : ++event:0x4f counters:cpuid um:ept minimum:2000003 name:ept_walk_pending : ++event:0x51 counters:cpuid um:l1d minimum:2000003 name:l1d_replacement : ++event:0x54 counters:cpuid um:tx_mem minimum:2000003 name:tx_mem : ++event:0x5d counters:cpuid um:tx_exec minimum:2000003 name:tx_exec : ++event:0x5e counters:cpuid um:rs_events minimum:2000003 name:rs_events : ++event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000003 name:offcore_requests_outstanding : ++event:0x63 counters:cpuid um:lock_cycles minimum:2000003 name:lock_cycles_cache_lock_duration : ++event:0x79 counters:cpuid um:idq minimum:2000003 name:idq : ++event:0x80 counters:cpuid um:icache_16b minimum:2000003 name:icache_16b_ifdata_stall : ++event:0x83 counters:cpuid um:icache_64b minimum:200003 name:icache_64b : ++event:0x85 counters:cpuid um:itlb_misses minimum:100003 name:itlb_misses : ++event:0x87 counters:cpuid um:ild_stall minimum:2000003 name:ild_stall_lcp : ++event:0x9c counters:cpuid um:idq_uops_not_delivered minimum:2000003 name:idq_uops_not_delivered : ++event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000003 name:uops_dispatched_port : ++event:0xa2 counters:cpuid um:resource_stalls minimum:2000003 name:resource_stalls : ++event:0xa3 counters:cpuid um:cycle_activity minimum:2000003 name:cycle_activity : ++event:0xa6 counters:cpuid um:exe_activity minimum:2000003 name:exe_activity : ++event:0xa8 counters:cpuid um:lsd minimum:2000003 name:lsd : ++event:0xab counters:cpuid um:dsb2mite_switches minimum:2000003 name:dsb2mite_switches_penalty_cycles : ++event:0xae counters:cpuid um:itlb minimum:100007 name:itlb_itlb_flush : ++event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_requests : ++event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed : ++event:0xb2 counters:cpuid um:offcore_requests_buffer minimum:2000003 name:offcore_requests_buffer_sq_full : ++event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush : ++event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists_any : ++event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired : 
++event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears : ++event:0xc4 counters:cpuid um:br_inst_retired minimum:400009 name:br_inst_retired : ++event:0xc5 counters:cpuid um:br_misp_retired minimum:400009 name:br_misp_retired : ++event:0xc7 counters:cpuid um:fp_arith_inst_retired minimum:2000003 name:fp_arith_inst_retired : ++event:0xc8 counters:cpuid um:hle_retired minimum:2000003 name:hle_retired : ++event:0xc9 counters:cpuid um:rtm_retired minimum:2000003 name:rtm_retired : ++event:0xca counters:cpuid um:fp_assist minimum:100003 name:fp_assist_any : ++event:0xcb counters:cpuid um:hw_interrupts minimum:100003 name:hw_interrupts_received : ++event:0xd0 counters:0,1,2,3 um:mem_inst_retired minimum:2000003 name:mem_inst_retired : ++event:0xd1 counters:0,1,2,3 um:mem_load_retired minimum:2000003 name:mem_load_retired : ++event:0xd2 counters:0,1,2,3 um:mem_load_l3_hit_retired minimum:100003 name:mem_load_l3_hit_retired : ++event:0xe6 counters:cpuid um:baclears minimum:100003 name:baclears_any : ++event:0xf0 counters:cpuid um:l2_trans minimum:200003 name:l2_trans_l2_wb : ++event:0xf1 counters:cpuid um:l2_lines_in minimum:100003 name:l2_lines_in_all : +diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks +new file mode 100644 +index 0000000..98ed65c +--- /dev/null ++++ b/events/i386/skylake/unit_masks +@@ -0,0 +1,314 @@ ++# ++# Unit masks for the Intel "Skylake" micro architecture ++# ++# See http://ark.intel.com/ for help in identifying Skylake based CPUs ++# ++name:arith type:mandatory default:0x1 ++ 0x1 extra:cmask=1,edge divider_active Cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. ++name:baclears type:mandatory default:0x1 ++ 0x1 extra: any Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end. ++name:dsb2mite_switches type:mandatory default:0x2 ++ 0x2 extra: penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. ++name:ept type:mandatory default:0x10 ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a EPT (Extended Page Table) walk for any request type. ++name:fp_assist type:mandatory default:0x1e ++ 0x1e extra:cmask=1 any Cycles with any input/output SSE or FP assist ++name:hw_interrupts type:mandatory default:0x1 ++ 0x1 extra: received Number of hardware interrupts received by the processor. ++name:icache_16b type:mandatory default:0x4 ++ 0x4 extra: ifdata_stall Cycles where a code fetch is stalled due to L1 instruction cache miss. ++name:ild_stall type:mandatory default:0x1 ++ 0x1 extra: lcp Stalls caused by changing prefix length of the instruction. ++name:itlb type:mandatory default:0x1 ++ 0x1 extra: itlb_flush Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages. ++name:l1d type:mandatory default:0x1 ++ 0x1 extra: replacement L1D data line replacements ++name:l2_lines_in type:mandatory default:0x7 ++ 0x7 extra: all L2 cache lines filling L2 ++name:l2_trans type:mandatory default:0x40 ++ 0x40 extra: l2_wb L2 writebacks that access L2 cache ++name:ld_blocks_partial type:mandatory default:0x1 ++ 0x1 extra: address_alias False dependencies in MOB due to partial compare on address. ++name:load_hit_pre type:mandatory default:0x1 ++ 0x1 extra: sw_pf Demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch. 
++name:lock_cycles type:mandatory default:0x2 ++ 0x2 extra: cache_lock_duration Cycles when L1D is locked ++name:offcore_requests_buffer type:mandatory default:0x1 ++ 0x1 extra: sq_full Offcore requests buffer cannot take more entries for this thread core. ++name:other_assists type:mandatory default:0x3f ++ 0x3f extra: any Number of times a microcode assist is invoked by HW other than FP-assist. Examples include AD (page Access Dirty) and AVX* related assists. ++name:inst_retired type:exclusive default:any ++ 0x1 extra: any Instructions retired from execution.mem ++ 0x0 extra: any_p Number of instructions retired. General Counter - architectural event ++ 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution ++name:cpu_clk_unhalted type:exclusive default:thread ++ 0x2 extra: thread Core cycles when the thread is not in halt state ++ 0x3 extra: ref_tsc Reference cycles when the core is not in halt state. ++ 0x0 extra: thread_p Thread cycles when thread is not in halt state ++ 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state ++ 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state ++name:ld_blocks type:exclusive default:0x2 ++ 0x2 extra: store_forward loads blocked by overlapping with store buffer that cannot be forwarded . ++ 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use ++name:dtlb_load_misses type:exclusive default:miss_causes_a_walk ++ 0x1 extra: miss_causes_a_walk Load misses in all DTLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for a load. ++ 0x20 extra: stlb_hit Loads that miss the DTLB and hit the STLB. ++ 0xe extra: walk_completed Load miss in all TLB levels causes a page walk that completes. (All page sizes) ++ 0x10 extra:cmask=1 walk_active Cycles when at least one PMH is busy with a page walk for a load. ++name:int_misc type:exclusive default:recovery_cycles ++ 0x1 extra: recovery_cycles Core cycles the allocator was stalled due to recovery from earlier clear event for this thread (e.g. misprediction or memory nuke) ++ 0x80 extra: clear_resteer_cycles Cycles the issue-stage is waiting for front-end to fetch from resteered path following branch misprediction or machine clear events. ++ 0x1 extra:any recovery_cycles_any Core cycles the allocator was stalled due to recovery from earlier clear event for any thread running on the physical core (e.g. misprediction or memory nuke) ++name:uops_issued type:exclusive default:any ++ 0x1 extra: any Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS) ++ 0x20 extra: slow_lea Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles when Resource Allocation Table (RAT) does not issue Uops to Reservation Station (RS) for the thread ++ 0x2 extra: vector_width_mismatch This event counts the number of Blend Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS) in order to preserve upper bits of vector registers. 
Starting the Skylake microarchitecture, these Blend uops are needed since every Intel SSE instruction executed in Dirty Upper State needs to preserve bits 128-255 of the destination register. For more information, refer to ?Mixing Intel AVX and Intel SSE Code? section of the Optimization Guide. ++name:l2_rqsts type:exclusive default:0x21 ++ 0x21 extra: demand_data_rd_miss Demand Data Read miss L2, no rejects ++ 0x41 extra: demand_data_rd_hit Demand Data Read requests that hit L2 cache ++ 0xe1 extra: all_demand_data_rd Demand Data Read requests ++ 0xe2 extra: all_rfo RFO requests to L2 cache ++ 0xe4 extra: all_code_rd L2 code requests ++ 0xf8 extra: all_pf Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches ++ 0x38 extra: pf_miss Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that miss L2 cache ++ 0xd8 extra: pf_hit Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that hit L2 cache ++ 0x42 extra: rfo_hit RFO requests that hit L2 cache ++ 0x22 extra: rfo_miss RFO requests that miss L2 cache ++ 0x44 extra: code_rd_hit L2 cache hits when fetching instructions, code reads. ++ 0x24 extra: code_rd_miss L2 cache misses when fetching instructions ++ 0x27 extra: all_demand_miss Demand requests that miss L2 cache ++ 0xe7 extra: all_demand_references Demand requests to L2 cache ++ 0x3f extra: miss All requests that miss L2 cache ++ 0xff extra: references All L2 requests ++name:longest_lat_cache type:exclusive default:0x41 ++ 0x41 extra: miss Core-originated cacheable demand requests missed L3 ++ 0x4f extra: reference Core-originated cacheable demand requests that refer to L3 ++name:cpu_clk_thread_unhalted type:exclusive default:ref_xclk ++ 0x1 extra: ref_xclk Reference cycles when the thread is unhalted (counts at 100 MHz rate) ++ 0x2 extra: one_thread_active Count XClk pulses when this thread is unhalted and the other thread is halted. ++ 0x1 extra:any ref_xclk_any Reference cycles when the at least one thread on the physical core is unhalted (counts at 100 MHz rate) ++name:l1d_pend_miss type:exclusive default:pending ++ 0x1 extra: pending L1D miss oustandings duration in cycles ++ 0x2 extra: fb_full Number of times a request needed a FB entry but there was no entry available for it. That is the FB unavailability was dominant reason for blocking the request. A request includes cacheable/uncacheable demands that is load, store or SW prefetch. HWP are e ++ 0x1 extra:cmask=1 pending_cycles Cycles with L1D load Misses outstanding. ++ 0x1 extra:cmask=1,any pending_cycles_any Cycles with L1D load Misses outstanding from any thread on physical core ++name:dtlb_store_misses type:exclusive default:miss_causes_a_walk ++ 0x1 extra: miss_causes_a_walk Store misses in all DTLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for a store. ++ 0x20 extra: stlb_hit Stores that miss the DTLB and hit the STLB. ++ 0xe extra: walk_completed Store misses in all TLB levels causes a page walk that completes. (All page sizes) ++ 0x10 extra:cmask=1 walk_active Cycles when at least one PMH is busy with a page walk for a store. ++name:tx_mem type:exclusive default:0x1 ++ 0x1 extra: abort_conflict Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address ++ 0x2 extra: abort_capacity Number of times a transactional abort was signaled due to a data capacity limitation for transactional reads or writes. 
++ 0x4 extra: abort_hle_store_to_elided_lock Number of times a HLE transactional region aborted due to a non XRELEASE prefixed instruction writing to an elided lock in the elision buffer ++ 0x8 extra: abort_hle_elision_buffer_not_empty Number of times an HLE transactional execution aborted due to NoAllocatedElisionBuffer being non-zero. ++ 0x10 extra: abort_hle_elision_buffer_mismatch Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer ++ 0x20 extra: abort_hle_elision_buffer_unsupported_alignment Number of times an HLE transactional execution aborted due to an unsupported read alignment from the elision buffer. ++ 0x40 extra: hle_elision_buffer_full Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero. ++name:tx_exec type:exclusive default:0x1 ++ 0x1 extra: misc1 Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort. ++ 0x2 extra: misc2 Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region ++ 0x4 extra: misc3 Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded ++ 0x8 extra: misc4 Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region. ++ 0x10 extra: misc5 Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region ++name:rs_events type:exclusive default:empty_cycles ++ 0x1 extra: empty_cycles Cycles when Reservation Station (RS) is empty for the thread ++ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues. ++name:offcore_requests_outstanding type:exclusive default:demand_data_rd ++ 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue. ++ 0x2 extra:cmask=1 demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. ++ 0x4 extra:cmask=1 demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore ++ 0x10 extra: l3_miss_demand_data_rd Counts number of Offcore outstanding Demand Data Read requests who miss L3 cache in the superQ every cycle. ++ 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x8 extra:cmask=1 cycles_with_data_rd Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore ++ 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. 
++ 0x4 extra:cmask=1 cycles_with_demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle ++ 0x10 extra:cmask=1 cycles_with_l3_miss_demand_data_rd Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ ++ 0x10 extra:cmask=6 l3_miss_demand_data_rd_ge_6 Cycles with at least 6 Demand Data Read requests who miss L3 cache in the superQ ++ 0x1 extra:cmask=6 demand_data_rd_ge_6 Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue ++name:idq type:exclusive default:mite_uops ++ 0x4 extra: mite_uops Uops delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x8 extra: dsb_uops Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path ++ 0x20 extra: ms_mite_uops Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++ 0x30 extra:cmask=1 ms_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++ 0x4 extra:cmask=1 mite_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path ++ 0x8 extra:cmask=1 dsb_cycles Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path ++ 0x10 extra:cmask=1 ms_dsb_cycles Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++ 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops ++ 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering any Uop ++ 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops ++ 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering any Uop ++ 0x30 extra:cmask=1,edge ms_switches Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer ++ 0x30 extra: ms_uops Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy ++name:icache_64b type:exclusive default:0x1 ++ 0x1 extra: iftag_hit Instruction fetch tag lookups that hit in the instruction cache (L1I). Counts at 64-byte cache-line granularity. ++ 0x2 extra: iftag_miss Instruction fetch tag lookups that miss in the instruction cache (L1I). Counts at 64-byte cache-line granularity. ++ 0x4 extra: iftag_stall Cycles where a code fetch is stalled due to L1 instruction cache tag miss. ++name:itlb_misses type:exclusive default:0x1 ++ 0x1 extra: miss_causes_a_walk Misses at all ITLB levels that cause page walks ++ 0x10 extra: walk_pending Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. ++ 0x20 extra: stlb_hit Intruction fetch requests that miss the ITLB and hit the STLB. ++ 0xe extra: walk_completed Code miss in all TLB levels causes a page walk that completes. 
(All page sizes) ++name:idq_uops_not_delivered type:exclusive default:core ++ 0x1 extra: core Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled ++ 0x1 extra:cmask=4 cycles_0_uops_deliv_core Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled ++ 0x1 extra:cmask=3 cycles_le_1_uop_deliv_core Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled ++ 0x1 extra:cmask=2 cycles_le_2_uop_deliv_core Cycles with less than 2 uops delivered by the front end. ++ 0x1 extra:cmask=1 cycles_le_3_uop_deliv_core Cycles with less than 3 uops delivered by the front end. ++ 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. ++name:uops_dispatched_port type:exclusive default:0x1 ++ 0x1 extra: port_0 Cycles per thread when uops are executed in port 0 ++ 0x2 extra: port_1 Cycles per thread when uops are executed in port 1 ++ 0x4 extra: port_2 Cycles per thread when uops are executed in port 2 ++ 0x8 extra: port_3 Cycles per thread when uops are executed in port 3 ++ 0x10 extra: port_4 Cycles per thread when uops are executed in port 4 ++ 0x20 extra: port_5 Cycles per thread when uops are executed in port 5 ++ 0x40 extra: port_6 Cycles per thread when uops are executed in port 6 ++ 0x80 extra: port_7 Cycles per thread when uops are executed in port 7 ++name:resource_stalls type:exclusive default:0x1 ++ 0x1 extra: any Resource-related stall cycles ++ 0x8 extra: sb Cycles stalled due to no store buffers available. (not including draining form sync). ++name:cycle_activity type:exclusive default:0x4 ++ 0x4 extra:cmask=4 stalls_total Total execution stalls. ++ 0x8 extra:cmask=8 cycles_l1d_miss Cycles while L1 cache miss demand load is outstanding. ++ 0xc extra:cmask=c stalls_l1d_miss Execution stalls while L1 cache miss demand load is outstanding. ++ 0x1 extra:cmask=1 cycles_l2_miss Cycles while L2 cache miss demand load is outstanding. ++ 0x5 extra:cmask=5 stalls_l2_miss Execution stalls while L2 cache miss demand load is outstanding. ++ 0x10 extra:cmask=10 cycles_mem_any Cycles while memory subsystem has an outstanding load. ++ 0x14 extra:cmask=14 stalls_mem_any Execution stalls while memory subsystem has an outstanding load. ++ 0x2 extra:cmask=2 cycles_l3_miss Cycles while L3 cache miss demand load is outstanding. ++ 0x6 extra:cmask=6 stalls_l3_miss Execution stalls while L3 cache miss demand load is outstanding. ++name:exe_activity type:exclusive default:0x1 ++ 0x1 extra: exe_bound_0_ports Cycles where no uops were executed, the Reservation Station was not empty, the Store Buffer was full and there was no outstanding load. ++ 0x2 extra: u1_ports_util Cycles total of 1 uop is executed on all ports and Reservation Station was not empty. ++ 0x4 extra: u2_ports_util Cycles total of 2 uops are executed on all ports and Reservation Station was not empty. ++ 0x8 extra: u3_ports_util Cycles total of 3 uops are executed on all ports and Reservation Station was not empty. ++ 0x10 extra: u4_ports_util Cycles total of 4 uops are executed on all ports and Reservation Station was not empty. ++ 0x40 extra: bound_on_stores Cycles where the Store Buffer was full and no outstanding load. ++name:lsd type:exclusive default:uops ++ 0x1 extra: uops Number of Uops delivered by the LSD. 
++ 0x1 extra:cmask=1 cycles_active Cycles Uops delivered by the LSD, but didn't come from the decoder ++ 0x1 extra:cmask=4 cycles_4_uops Cycles 4 Uops delivered by the LSD, but didn't come from the decoder ++name:offcore_requests type:exclusive default:0x80 ++ 0x80 extra: all_requests Any memory transaction that reached the SQ. ++ 0x1 extra: demand_data_rd Demand Data Read requests sent to uncore ++ 0x2 extra: demand_code_rd Cacheable and noncachaeble code read requests ++ 0x4 extra: demand_rfo Demand RFO requests including regular RFOs, locks, ItoM ++ 0x8 extra: all_data_rd Demand and prefetch data reads ++ 0x10 extra: l3_miss_demand_data_rd Demand Data Read requests who miss L3 cache ++name:uops_executed type:exclusive default:thread ++ 0x1 extra: thread Counts the number of uops to be executed per-thread each cycle. ++ 0x2 extra: core Number of uops executed on the core. ++ 0x10 extra: x87 Counts the number of x87 uops dispatched. ++ 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread. ++ 0x1 extra:cmask=1 cycles_ge_1_uop_exec Cycles where at least 1 uop was executed per-thread ++ 0x1 extra:cmask=2 cycles_ge_2_uops_exec Cycles where at least 2 uops were executed per-thread ++ 0x1 extra:cmask=3 cycles_ge_3_uops_exec Cycles where at least 3 uops were executed per-thread ++ 0x1 extra:cmask=4 cycles_ge_4_uops_exec Cycles where at least 4 uops were executed per-thread ++ 0x2 extra:cmask=1 core_cycles_ge_1 Cycles at least 1 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=2 core_cycles_ge_2 Cycles at least 2 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=3 core_cycles_ge_3 Cycles at least 3 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=4 core_cycles_ge_4 Cycles at least 4 micro-op is executed from any thread on physical core ++ 0x2 extra:cmask=1,inv core_cycles_none Cycles with no micro-ops executed from any thread on physical core ++name:tlb_flush type:exclusive default:0x1 ++ 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries ++ 0x20 extra: stlb_any STLB flush attempts ++name:uops_retired type:exclusive default:retire_slots ++ 0x2 extra: retire_slots Retirement slots used. ++ 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops. ++ 0x1 extra:cmask=a,inv total_cycles Cycles with less than 10 actually retired uops. ++name:machine_clears type:exclusive default:0x1 ++ 0x1 extra:cmask=1,edge count Number of machine clears (nukes) of any type. ++ 0x2 extra: memory_ordering Counts the number of machine clears due to memory order conflicts. ++ 0x4 extra: smc Self-modifying code (SMC) detected. ++name:br_inst_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches All (macro) branch instructions retired. ++ 0x1 extra: conditional Conditional branch instructions retired. ++ 0x1 extra:pebs conditional_pebs Conditional branch instructions retired. ++ 0x2 extra: near_call Direct and indirect near call instructions retired. ++ 0x2 extra:pebs near_call_pebs Direct and indirect near call instructions retired. ++ 0x8 extra: near_return Return instructions retired. ++ 0x8 extra:pebs near_return_pebs Return instructions retired. ++ 0x10 extra: not_taken Not taken branch instructions retired. ++ 0x20 extra: near_taken Taken branch instructions retired. ++ 0x20 extra:pebs near_taken_pebs Taken branch instructions retired. ++ 0x40 extra: far_branch Far branch instructions retired. 
++ 0x40 extra:pebs far_branch_pebs Far branch instructions retired. ++ 0x4 extra:pebs all_branches_pebs All (macro) branch instructions retired. ++name:br_misp_retired type:exclusive default:all_branches ++ 0x0 extra: all_branches All mispredicted macro branch instructions retired. ++ 0x1 extra: conditional Mispredicted conditional branch instructions retired. ++ 0x1 extra:pebs conditional_pebs Mispredicted conditional branch instructions retired. ++ 0x20 extra: near_taken number of near branch instructions retired that were mispredicted and taken. ++ 0x20 extra:pebs near_taken_pebs number of near branch instructions retired that were mispredicted and taken. ++ 0x4 extra:pebs all_branches_pebs Mispredicted macro branch instructions retired. ++name:fp_arith_inst_retired type:exclusive default:0x1 ++ 0x1 extra: scalar_double Number of SSE/AVX computational scalar double precision floating-point instructions retired. Each count represents 1 computation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x2 extra: scalar_single Number of SSE/AVX computational scalar single precision floating-point instructions retired. Each count represents 1 computation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x4 extra: u128b_packed_double Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired. Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x8 extra: u128b_packed_single Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired. Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x10 extra: u256b_packed_double Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired. Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++ 0x20 extra: u256b_packed_single Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired. Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ++name:hle_retired type:exclusive default:start ++ 0x1 extra: start Number of times an HLE execution started. ++ 0x2 extra: commit Number of times an HLE execution successfully committed ++ 0x4 extra: aborted Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). 
++ 0x4 extra:pebs aborted_pebs Number of times an HLE execution aborted due to any reasons (multiple categories may count as one).
++ 0x8 extra: aborted_misc1 Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts).
++ 0x10 extra: aborted_misc2 Number of times an HLE execution aborted due to hardware timer expiration.
++ 0x20 extra: aborted_misc3 Number of times an HLE execution aborted due to HLE-unfriendly instructions and certain unfriendly events (such as AD assists etc.).
++ 0x40 extra: aborted_misc4 Number of times an HLE execution aborted due to incompatible memory type
++ 0x80 extra: aborted_misc5 Number of times an HLE execution aborted due to unfriendly events (such as interrupts).
++name:rtm_retired type:exclusive default:start
++ 0x1 extra: start Number of times an RTM execution started.
++ 0x2 extra: commit Number of times an RTM execution successfully committed
++ 0x4 extra: aborted Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).
++ 0x4 extra:pebs aborted_pebs Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).
++ 0x8 extra: aborted_misc1 Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)
++ 0x10 extra: aborted_misc2 Number of times an RTM execution aborted due to uncommon conditions.
++ 0x20 extra: aborted_misc3 Number of times an RTM execution aborted due to HLE-unfriendly instructions
++ 0x40 extra: aborted_misc4 Number of times an RTM execution aborted due to incompatible memory type
++ 0x80 extra: aborted_misc5 Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt)
++name:mem_inst_retired type:exclusive default:stlb_miss_loads
++ 0x11 extra: stlb_miss_loads Number of load instructions retired with STLB miss
++ 0x11 extra:pebs stlb_miss_loads_pebs Number of load instructions retired with STLB miss
++ 0x12 extra: stlb_miss_stores Number of store instructions retired with STLB miss
++ 0x12 extra:pebs stlb_miss_stores_pebs Number of store instructions retired with STLB miss
++ 0x21 extra: lock_loads Number of lock load instructions retired
++ 0x21 extra:pebs lock_loads_pebs Number of lock load instructions retired
++ 0x41 extra: split_loads Number of load instructions retired with cache-line splits that may impact performance.
++ 0x41 extra:pebs split_loads_pebs Number of load instructions retired with cache-line splits that may impact performance.
++ 0x42 extra: split_stores Number of store instructions retired with line-split
++ 0x42 extra:pebs split_stores_pebs Number of store instructions retired with line-split
++ 0x81 extra: all_loads Number of load instructions retired
++ 0x81 extra:pebs all_loads_pebs Number of load instructions retired
++ 0x82 extra: all_stores Number of store instructions retired
++ 0x82 extra:pebs all_stores_pebs Number of store instructions retired
++name:mem_load_retired type:exclusive default:l1_hit
++ 0x1 extra: l1_hit Retired load instructions with L1 cache hits as data sources
++ 0x1 extra:pebs l1_hit_pebs Retired load instructions with L1 cache hits as data sources
++ 0x2 extra: l2_hit Retired load instructions with L2 cache hits as data sources
++ 0x2 extra:pebs l2_hit_pebs Retired load instructions with L2 cache hits as data sources
++ 0x4 extra: l3_hit Retired load instructions with L3 cache hits as data sources
++ 0x4 extra:pebs l3_hit_pebs Retired load instructions with L3 cache hits as data sources
++ 0x8 extra: l1_miss Retired load instructions missed L1 cache as data sources
++ 0x8 extra:pebs l1_miss_pebs Retired load instructions missed L1 cache as data sources
++ 0x10 extra: l2_miss Retired load instructions missed L2 cache as data sources
++ 0x10 extra:pebs l2_miss_pebs Retired load instructions missed L2 cache as data sources
++ 0x20 extra: l3_miss Retired load instructions missed L3 cache as data sources
++ 0x20 extra:pebs l3_miss_pebs Retired load instructions missed L3 cache as data sources
++ 0x40 extra: fb_hit Retired load instructions which data sources were load missed L1 but hit FB due to preceding miss to the same cache line with data not ready
++ 0x40 extra:pebs fb_hit_pebs Retired load instructions which data sources were load missed L1 but hit FB due to preceding miss to the same cache line with data not ready
++name:mem_load_l3_hit_retired type:exclusive default:xsnp_miss
++ 0x1 extra: xsnp_miss Retired load instructions which data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
++ 0x1 extra:pebs xsnp_miss_pebs Retired load instructions which data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
++ 0x2 extra: xsnp_hit Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache
++ 0x2 extra:pebs xsnp_hit_pebs Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache
++ 0x4 extra: xsnp_hitm Retired load instructions which data sources were HitM responses from shared L3
++ 0x4 extra:pebs xsnp_hitm_pebs Retired load instructions which data sources were HitM responses from shared L3
++ 0x8 extra: xsnp_none Retired load instructions which data sources were hits in L3 without snoops required
++ 0x8 extra:pebs xsnp_none_pebs Retired load instructions which data sources were hits in L3 without snoops required
+diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
+index 24ed697..b1d5ecf 100644
+--- a/libop/op_cpu_type.c
++++ b/libop/op_cpu_type.c
+@@ -121,6 +121,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
+ { "APM X-Gene", "arm/armv8-xgene", CPU_ARM_V8_APM_XGENE, 6 },
+ { "ARM Cortex-A57", "arm/armv8-ca57", CPU_ARM_V8_CA57, 6},
+ { "ARM Cortex-A53", "arm/armv8-ca53", CPU_ARM_V8_CA53, 6},
++ { "Intel Skylake microarchitecture", "i386/skylake", CPU_SKYLAKE, 4 },
+ };
+
+ static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
+@@ -737,6 +738,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type)
+ case CPU_NEHALEM:
+ case CPU_HASWELL:
+ case CPU_BROADWELL:
++ case CPU_SKYLAKE:
+ case CPU_SILVERMONT:
+ case CPU_WESTMERE:
+ case CPU_SANDYBRIDGE:
+diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h
+index 2bd00ce..9983f87 100644
+--- a/libop/op_cpu_type.h
++++ b/libop/op_cpu_type.h
+@@ -101,6 +101,7 @@ typedef enum {
+ CPU_ARM_V8_APM_XGENE, /* APM X-Gene */
+ CPU_ARM_V8_CA57, /* ARM Cortex-A57 */
+ CPU_ARM_V8_CA53, /* ARM Cortex-A53 */
++ CPU_SKYLAKE, /** < Intel Skylake microarchitecture */
+ MAX_CPU_TYPE
+ } op_cpu;
+
+diff --git a/libop/op_events.c b/libop/op_events.c
+index 2badc8e..f58d243 100644
+--- a/libop/op_events.c
++++ b/libop/op_events.c
+@@ -1200,6 +1200,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
+ case CPU_NEHALEM:
+ case CPU_HASWELL:
+ case CPU_BROADWELL:
++ case CPU_SKYLAKE:
+ case CPU_SILVERMONT:
+ case CPU_WESTMERE:
+ case CPU_SANDYBRIDGE:
+diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
+index 8a7ed1c..994fec4 100644
+--- a/libop/op_hw_specific.h
++++ b/libop/op_hw_specific.h
+@@ -154,6 +154,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type)
+ case 0x4f:
+ case 0x56:
+ return CPU_BROADWELL;
++ case 0x4e:
++ case 0x5e:
++ return CPU_SKYLAKE;
+ case 0x37:
+ case 0x4d:
+ case 0x4c:
+diff --git a/utils/ophelp.c b/utils/ophelp.c
+index a80fec8..fdddddc 100644
+--- a/utils/ophelp.c
++++ b/utils/ophelp.c
+@@ -542,6 +542,7 @@ int main(int argc, char const * argv[])
+ case CPU_NEHALEM:
+ case CPU_HASWELL:
+ case CPU_BROADWELL:
++ case CPU_SKYLAKE:
+ case CPU_SILVERMONT:
+ case CPU_WESTMERE:
+ case CPU_SANDYBRIDGE:
+--
+2.4.3
+
+From ccc38adf33e3ae845e0b7c4f8fe77beceaa7b930 Mon Sep 17 00:00:00 2001
+From: Andi Kleen
+Date: Mon, 6 Jul 2015 16:48:25 -0700
+Subject: [PATCH 2/3] oprofile: Fixes for Skylake event lists
+
+This fixes the review feedback for the Skylake event list.
+
+- Fix event codes for INST_RETIRED, CPU_CLK_UNHALTED.
+- Fix OFFCORE_REQUESTS_OUTSTANDING events
+- Add br_inst_retired.all_branches_pebs
+- Fill in correct default event
+---
+ events/i386/skylake/events | 4 ++--
+ events/i386/skylake/unit_masks | 25 +++++++++++++------------
+ libop/op_events.c | 5 ++++-
+ 3 files changed, 19 insertions(+), 15 deletions(-)
+
+diff --git a/events/i386/skylake/events b/events/i386/skylake/events
+index 28d6654..9a04a86 100644
+--- a/events/i386/skylake/events
++++ b/events/i386/skylake/events
+@@ -6,8 +6,6 @@
+ # Note the minimum counts are not discovered experimentally and could be likely
+ # lowered in many cases without ill effect.
+ #
+-event:0x00 counters:1 um:inst_retired minimum:2000003 name:inst_retired :
+-event:0x00 counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted :
+ event:0x03 counters:cpuid um:ld_blocks minimum:100003 name:ld_blocks :
+ event:0x07 counters:cpuid um:ld_blocks_partial minimum:100003 name:ld_blocks_partial_address_alias :
+ event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000003 name:dtlb_load_misses :
+@@ -16,6 +14,7 @@ event:0x0e counters:cpuid um:uops_issued minimum:2000003 name:uops_issued :
+ event:0x14 counters:cpuid um:arith minimum:2000003 name:arith_divider_active :
+ event:0x24 counters:cpuid um:l2_rqsts minimum:200003 name:l2_rqsts :
+ event:0x2e counters:cpuid um:longest_lat_cache minimum:100003 name:longest_lat_cache :
++event:0x3c counters:cpuid um:cpu_clk_unhalted minimum:2000003 name:cpu_clk_unhalted :
+ event:0x3c counters:cpuid um:cpu_clk_thread_unhalted minimum:2000003 name:cpu_clk_thread_unhalted :
+ event:0x48 counters:cpuid um:l1d_pend_miss minimum:2000003 name:l1d_pend_miss :
+ event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000003 name:dtlb_store_misses :
+@@ -44,6 +43,7 @@ event:0xb0 counters:cpuid um:offcore_requests minimum:100003 name:offcore_reques
+ event:0xb1 counters:cpuid um:uops_executed minimum:2000003 name:uops_executed :
+ event:0xb2 counters:cpuid um:offcore_requests_buffer minimum:2000003 name:offcore_requests_buffer_sq_full :
+ event:0xbd counters:cpuid um:tlb_flush minimum:100007 name:tlb_flush :
++event:0xc0 counters:1 um:inst_retired minimum:2000003 name:inst_retired :
+ event:0xc1 counters:cpuid um:other_assists minimum:100003 name:other_assists_any :
+ event:0xc2 counters:cpuid um:uops_retired minimum:2000003 name:uops_retired :
+ event:0xc3 counters:cpuid um:machine_clears minimum:100003 name:machine_clears :
+diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks
+index 98ed65c..b505769 100644
+--- a/events/i386/skylake/unit_masks
++++ b/events/i386/skylake/unit_masks
+@@ -37,16 +37,6 @@ name:offcore_requests_buffer type:mandatory default:0x1
+ 0x1 extra: sq_full Offcore requests buffer cannot take more entries for this thread core.
+ name:other_assists type:mandatory default:0x3f
+ 0x3f extra: any Number of times a microcode assist is invoked by HW other than FP-assist. Examples include AD (page Access Dirty) and AVX* related assists.
+-name:inst_retired type:exclusive default:any
+- 0x1 extra: any Instructions retired from execution.mem
+- 0x0 extra: any_p Number of instructions retired. General Counter - architectural event
+- 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution
+-name:cpu_clk_unhalted type:exclusive default:thread
+- 0x2 extra: thread Core cycles when the thread is not in halt state
+- 0x3 extra: ref_tsc Reference cycles when the core is not in halt state.
+- 0x0 extra: thread_p Thread cycles when thread is not in halt state
+- 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state
+- 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state
+ name:ld_blocks type:exclusive default:0x2
+ 0x2 extra: store_forward loads blocked by overlapping with store buffer that cannot be forwarded .
+ 0x8 extra: no_sr The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use
+@@ -85,6 +75,12 @@ name:l2_rqsts type:exclusive default:0x21
+ name:longest_lat_cache type:exclusive default:0x41
+ 0x41 extra: miss Core-originated cacheable demand requests missed L3
+ 0x4f extra: reference Core-originated cacheable demand requests that refer to L3
++name:cpu_clk_unhalted type:exclusive default:thread
++ 0x2 extra: thread Core cycles when the thread is not in halt state
++ 0x3 extra: ref_tsc Reference cycles when the core is not in halt state.
++ 0x0 extra: thread_p Thread cycles when thread is not in halt state
++ 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state
++ 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state
+ name:cpu_clk_thread_unhalted type:exclusive default:ref_xclk
+ 0x1 extra: ref_xclk Reference cycles when the thread is unhalted (counts at 100 MHz rate)
+ 0x2 extra: one_thread_active Count XClk pulses when this thread is unhalted and the other thread is halted.
+@@ -119,8 +115,8 @@ name:rs_events type:exclusive default:empty_cycles
+ 0x1 extra:cmask=1,inv,edge empty_end Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues.
+ name:offcore_requests_outstanding type:exclusive default:demand_data_rd
+ 0x1 extra: demand_data_rd Offcore outstanding Demand Data Read transactions in uncore queue.
+- 0x2 extra:cmask=1 demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.
+- 0x4 extra:cmask=1 demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle
++ 0x2 extra: demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.
++ 0x4 extra: demand_rfo Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle
+ 0x8 extra: all_data_rd Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore
+ 0x10 extra: l3_miss_demand_data_rd Counts number of Offcore outstanding Demand Data Read requests who miss L3 cache in the superQ every cycle.
+ 0x1 extra:cmask=1 cycles_with_demand_data_rd Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore
+@@ -217,6 +213,10 @@ name:uops_executed type:exclusive default:thread
+ name:tlb_flush type:exclusive default:0x1
+ 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries
+ 0x20 extra: stlb_any STLB flush attempts
++name:inst_retired type:exclusive default:any
++ 0x1 extra: any Instructions retired from execution.mem
++ 0x0 extra: any_p Number of instructions retired. General Counter - architectural event
++ 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution
+ name:uops_retired type:exclusive default:retire_slots
+ 0x2 extra: retire_slots Retirement slots used.
+ 0x1 extra:cmask=1,inv stall_cycles Cycles without actually retired uops.
+@@ -231,6 +231,7 @@ name:br_inst_retired type:exclusive default:all_branches
+ 0x1 extra:pebs conditional_pebs Conditional branch instructions retired.
+ 0x2 extra: near_call Direct and indirect near call instructions retired.
+ 0x2 extra:pebs near_call_pebs Direct and indirect near call instructions retired.
++ 0x0 extra:pebs all_branches_pebs All (macro) branch instructions retired.
+ 0x8 extra: near_return Return instructions retired.
+ 0x8 extra:pebs near_return_pebs Return instructions retired.
+ 0x10 extra: not_taken Not taken branch instructions retired.
+diff --git a/libop/op_events.c b/libop/op_events.c
+index f58d243..25f010e 100644
+--- a/libop/op_events.c
++++ b/libop/op_events.c
+@@ -1200,7 +1200,6 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
+ case CPU_NEHALEM:
+ case CPU_HASWELL:
+ case CPU_BROADWELL:
+- case CPU_SKYLAKE:
+ case CPU_SILVERMONT:
+ case CPU_WESTMERE:
+ case CPU_SANDYBRIDGE:
+@@ -1213,6 +1212,10 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr)
+ descr->name = "CPU_CLK_UNHALTED";
+ break;
+
++ case CPU_SKYLAKE:
++ descr->name = "cpu_clk_unhalted";
++ break;
++
+ case CPU_RTC:
+ descr->name = "RTC_INTERRUPTS";
+ descr->count = 1024;
+--
+2.4.3
+
+From cfb3ddbaae4ca2e073b5229bf6019da766eb8da9 Mon Sep 17 00:00:00 2001
+From: Andi Kleen
+Date: Tue, 7 Jul 2015 11:02:38 -0700
+Subject: [PATCH 3/3] oprofile: Fix unit masks of fixed counters on Skylake
+
+Fix another issue noticed by William Cohen.
+
+The unit masks for the fixed counters were incorrect.
+
+Note that the fixed counters exist in two copies, as perf aliases
+them to the generic counter version codes.
+---
+ events/i386/skylake/unit_masks | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/events/i386/skylake/unit_masks b/events/i386/skylake/unit_masks
+index b505769..6e81a63 100644
+--- a/events/i386/skylake/unit_masks
++++ b/events/i386/skylake/unit_masks
+@@ -76,8 +76,8 @@ name:longest_lat_cache type:exclusive default:0x41
+ 0x41 extra: miss Core-originated cacheable demand requests missed L3
+ 0x4f extra: reference Core-originated cacheable demand requests that refer to L3
+ name:cpu_clk_unhalted type:exclusive default:thread
+- 0x2 extra: thread Core cycles when the thread is not in halt state
+- 0x3 extra: ref_tsc Reference cycles when the core is not in halt state.
++ 0x0 extra: thread Core cycles when the thread is not in halt state
++ 0x1 extra: ref_tsc Reference cycles when the core is not in halt state.
+ 0x0 extra: thread_p Thread cycles when thread is not in halt state
+ 0x2 extra:any thread_any Core cycles when at least one thread on the physical core is not in halt state
+ 0x0 extra:any thread_p_any Core cycles when at least one thread on the physical core is not in halt state
+@@ -214,7 +214,7 @@ name:tlb_flush type:exclusive default:0x1
+ 0x1 extra: dtlb_thread DTLB flush attempts of the thread-specific entries
+ 0x20 extra: stlb_any STLB flush attempts
+ name:inst_retired type:exclusive default:any
+- 0x1 extra: any Instructions retired from execution.mem
++ 0x0 extra: any Instructions retired from execution.mem
+ 0x0 extra: any_p Number of instructions retired. General Counter - architectural event
+ 0x1 extra:pebs prec_dist Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution
+ name:uops_retired type:exclusive default:retire_slots
+--
+2.4.3
+
diff --git a/SOURCES/oprofile-xml2.patch b/SOURCES/oprofile-xml2.patch
new file mode 100644
index 0000000..a1a7c17
--- /dev/null
+++ b/SOURCES/oprofile-xml2.patch
@@ -0,0 +1,134 @@
+commit 5646afee4c74a6759fc61d11b9203b0f6d60f529
+Author: Maynard Johnson
+Date: Thu May 29 10:10:41 2014 -0500
+
+ opreport XML: binary-level count field issues
+
+ See oprofile bug # 236 (https://sourceforge.net/p/oprofile/bugs/236/).
+
+ There are several issues relating to the use of the 'count' element
+ defined in opreport.xsd. For example, below is the current schema
+ definition for the 'binary' element. Note the usage of the 'count'
+ element:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ There have been questions from users whether the 'count' element
+ associated with the 'binary' element is supposed to represent a
+ total count across all modules for the executable or if it is only
+ the count for the executable itself (the answer is the latter).
+
+ Additionally, it's possible that there may be no samples at all
+ for the binary file -- i.e., all samples collected were for module
+ elements -- thus, the minOccurs attribute for the 'count' element
+ of 'binary' should be '0'.
+
+ Finally, using xmllint on a XML instance document created from
+ opreport on a profile run that specified "--separate-cpu" identified
+ that the instance document was invalid when compared against its
+ associated schema file (opreport.xsd). Reviewing the schema, I
+ realized that all usages of the 'count' element were wrong insofar
+ as the maxOccurs attribute. Instead of being set to '1', maxOccurs
+ should be 'unbounded' since we can have multiple 'count' elements
+ associated with any given higher level element (e.g., 'binary')
+ if there are multiple classes in the profile. Multiple classes
+ will exist for a profile for various reasons -- e.g., profiling with
+ '--separate-cpu', or multiple events.
+
+ This patch addresses these issues. The major version number of the
+ schema is not being changed -- only the minor number. This is because
+ instance documents that previously validated using the old schema
+ will still be valid with the new schema.
+
+ A testsuite patch is being developed to validate XML instance documents
+ for various scenarios.
+
+ Signed-off-by: Maynard Johnson
+
+diff --git a/doc/opreport.xsd b/doc/opreport.xsd
+index 682a0bf..28e3128 100644
+--- a/doc/opreport.xsd
++++ b/doc/opreport.xsd
+@@ -110,7 +110,7 @@
+
+
+
+-
++
+
+
+
+@@ -121,7 +121,7 @@
+
+
+
+-
++
+
+
+
+@@ -131,10 +131,13 @@
+
+
+
+-
++
++
+
+-
++
+
+
+
+@@ -144,7 +147,7 @@
+
+
+
+-
++
+
+
+
+@@ -203,7 +206,7 @@
+
+
+
+-
++
+
+
+
+diff --git a/libpp/xml_utils.cpp b/libpp/xml_utils.cpp
+index 942b236..5f1a3a1 100644
+--- a/libpp/xml_utils.cpp
++++ b/libpp/xml_utils.cpp
+@@ -245,11 +245,11 @@ void xml_utils::add_option(tag_t tag, bool value)
+ void xml_utils::output_xml_header(string const & command_options,
+ string const & cpu_info, string const & events)
+ {
+- // the integer portion indicates the schema version and should change
++ // The integer portion indicates the schema version and should change
+ // both here and in the schema file when major changes are made to
+- // the schema. changes to opreport, or minor changes to the schema
++ // the schema. Changes to opreport, or minor changes to the schema
+ // can be indicated by changes to the fraction part.
+- string const schema_version = "3.0";
++ string const schema_version = "3.1";
+
+ // This is the XML version, not schema version.
+ string const xml_header = "";
diff --git a/SPECS/oprofile.spec b/SPECS/oprofile.spec
index 93c68e9..f90d023 100644
--- a/SPECS/oprofile.spec
+++ b/SPECS/oprofile.spec
@@ -1,7 +1,7 @@
 Summary: System wide profiler
 Name: oprofile
 Version: 0.9.9
-Release: 7%{?dist}
+Release: 16%{?dist}
 License: GPLv2+ and LGPLv2+
 Group: Development/System
 #
@@ -18,10 +18,21 @@ Patch305: oprofile-rhbz1121205.patch
 Patch400: oprofile-haswell.patch
 Patch401: oprofile-silvermont.patch
 Patch402: oprofile-broadwell.patch
+Patch403: oprofile-intelcpuid.patch
 Patch500: oprofile-aarch64.patch
 Patch600: oprofile-power8.patch
 Patch601: oprofile-ppc64le.patch
 Patch602: oprofile-ppc64-equivalent.patch
+Patch700: oprofile-hugepage.patch
+Patch800: oprofile-defaultmask.patch
+Patch801: oprofile-extramask.patch
+Patch802: oprofile-maskarray.patch
+Patch803: oprofile-env.patch
+Patch804: oprofile-coverity.patch
+Patch900: oprofile-ppc64jvm.patch
+Patch1000: oprofile-skylake.patch
+Patch1001: oprofile-remap.patch
+Patch1002: oprofile-xml2.patch
 
 URL: http://oprofile.sf.net
 
@@ -92,10 +103,21 @@ agent library.
 %patch400 -p1 -b .haswell
 %patch401 -p1 -b .silvermont
 %patch402 -p1 -b .broadwell
+%patch403 -p1
 %patch500 -p1 -b .aarch64
 %patch600 -p1 -b .power8
 %patch601 -p1 -b .ppc64le
 %patch602 -p1
+%patch700 -p1
+%patch800 -p1
+%patch801 -p1
+%patch802 -p1
+%patch803 -p1
+%patch804 -p1
+%patch900 -p1
+%patch1000 -p1
+%patch1001 -p1
+%patch1002 -p1
 
 ./autogen.sh
 
@@ -182,6 +204,28 @@ exit 0
 %{_sysconfdir}/ld.so.conf.d/*
 
 %changelog
+* Tue Aug 25 2015 William Cohen - 0.9.9-16
+- Improved handling of remapped anonymous regions
+- Correct XML generation.
+
+* Wed Jul 8 2015 William Cohen - 0.9.9-15
+- Add support for Intel skylake processors.
+
+* Fri Jun 26 2015 William Cohen - 0.9.9-14
+- Recognize Intel Broadwell-DE.
+
+* Fri Jun 5 2015 William Cohen - 0.9.9-13
+- Further fix to allow operf to record information for Java anon_huges.
+
+* Fri Jun 5 2015 William Cohen - 0.9.9-12
+- Eliminate some coverity warnings.
+
+* Tue Apr 7 2015 William Cohen - 0.9.9-11
+- Avoid setting POSIXLY_CORRECT for the children tasks of operf and ocount.
+- Fix handling of default unit masks longer than 11 char.
+- Fix extra and default unitmasks selection.
+- Allow operf to record information for Java anon_huges.
+
 * Wed Oct 1 2014 Will Cohen - 0.9.9-7
 - Correct identification power8le. rhbz1148525